feat: Implement hash dictionary
This commit is contained in:
parent
cf2503214b
commit
75c459672c
10 changed files with 883 additions and 8 deletions
116
Cargo.lock
generated
116
Cargo.lock
generated
|
@ -55,6 +55,18 @@ version = "1.3.2"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
||||
|
||||
[[package]]
|
||||
name = "bstr"
|
||||
version = "0.2.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
"memchr",
|
||||
"regex-automata",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bytes"
|
||||
version = "1.2.1"
|
||||
|
@ -81,6 +93,7 @@ checksum = "335867764ed2de42325fafe6d18b8af74ba97ee0c590fa016f157535b42ab04b"
|
|||
dependencies = [
|
||||
"atty",
|
||||
"bitflags",
|
||||
"clap_derive",
|
||||
"clap_lex",
|
||||
"once_cell",
|
||||
"strsim",
|
||||
|
@ -89,6 +102,19 @@ dependencies = [
|
|||
"unicode-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap_derive"
|
||||
version = "4.0.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "16a1b0f6422af32d5da0c58e2703320f379216ee70198241c84173a8c5ac28f3"
|
||||
dependencies = [
|
||||
"heck",
|
||||
"proc-macro-error",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap_lex"
|
||||
version = "0.3.0"
|
||||
|
@ -125,17 +151,45 @@ dependencies = [
|
|||
"tracing-error",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "csv-async"
|
||||
version = "1.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c19b33b32fd48f83388821bd8f534b59e1b1ffd5c6c83771d1b23abd3dac2685"
|
||||
dependencies = [
|
||||
"bstr",
|
||||
"cfg-if",
|
||||
"csv-core",
|
||||
"futures",
|
||||
"itoa",
|
||||
"ryu",
|
||||
"serde",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "csv-core"
|
||||
version = "0.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dtmt"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"color-eyre",
|
||||
"csv-async",
|
||||
"futures",
|
||||
"futures-util",
|
||||
"glob",
|
||||
"nanorand",
|
||||
"pin-project-lite",
|
||||
"serde",
|
||||
"tempfile",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
|
@ -264,6 +318,12 @@ version = "0.3.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574"
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9"
|
||||
|
||||
[[package]]
|
||||
name = "hermit-abi"
|
||||
version = "0.1.19"
|
||||
|
@ -288,6 +348,12 @@ dependencies = [
|
|||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4217ad341ebadf8d8e724e264f13e593e0648f5b3e94b3896a5df283be015ecc"
|
||||
|
||||
[[package]]
|
||||
name = "lazy_static"
|
||||
version = "1.4.0"
|
||||
|
@ -416,6 +482,30 @@ version = "0.1.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-error"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
|
||||
dependencies = [
|
||||
"proc-macro-error-attr",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-error-attr"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.47"
|
||||
|
@ -482,6 +572,32 @@ version = "0.1.21"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7ef03e0a2b150c7a90d01faf6254c9c48a41e95fb2a8c2ac1c6f0d2b9aefc342"
|
||||
|
||||
[[package]]
|
||||
name = "ryu"
|
||||
version = "1.0.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09"
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.147"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d193d69bae983fc11a79df82342761dfbf28a99fc8d203dca4c3c1b590948965"
|
||||
dependencies = [
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive"
|
||||
version = "1.0.147"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4f1d362ca8fc9c3e3a7484440752472d68a6caa98f1ab81d99b5dfe517cec852"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sharded-slab"
|
||||
version = "0.1.4"
|
||||
|
|
|
@ -4,15 +4,17 @@ version = "0.1.0"
|
|||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
clap = { version = "4.0.15", features = ["color", "std", "cargo", "unicode"] }
|
||||
clap = { version = "4.0.15", features = ["color", "derive", "std", "cargo", "unicode"] }
|
||||
color-eyre = "0.6.2"
|
||||
csv-async = { version = "1.2.4", features = ["tokio", "serde"] }
|
||||
futures = "0.3.25"
|
||||
futures-util = "0.3.24"
|
||||
glob = "0.3.0"
|
||||
nanorand = "0.7.0"
|
||||
pin-project-lite = "0.2.9"
|
||||
tokio = { version = "1.21.2", features = ["rt-multi-thread", "fs", "process", "macros", "tracing", "io-util"] }
|
||||
tokio-stream = { version = "0.1.11", features = ["fs"] }
|
||||
serde = { version = "1.0.147", features = ["derive"] }
|
||||
tokio = { version = "1.21.2", features = ["rt-multi-thread", "fs", "process", "macros", "tracing", "io-util", "io-std"] }
|
||||
tokio-stream = { version = "0.1.11", features = ["fs", "io-util"] }
|
||||
tracing = { version = "0.1.37", features = ["async-await"] }
|
||||
tracing-error = "0.2.0"
|
||||
tracing-subscriber = { version = "0.3.16", features = ["env-filter"] }
|
||||
|
|
152
src/bin/cmd/dictionary.rs
Normal file
152
src/bin/cmd/dictionary.rs
Normal file
|
@ -0,0 +1,152 @@
|
|||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
|
||||
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
|
||||
use color_eyre::eyre::{Context, Result};
|
||||
use color_eyre::{Help, SectionExt};
|
||||
use dtmt::lookup_hash;
|
||||
use dtmt::murmur::HashGroup;
|
||||
use tokio::fs::File;
|
||||
use tokio::io::{AsyncBufReadExt, BufReader};
|
||||
use tokio::sync::RwLock;
|
||||
use tokio_stream::wrappers::LinesStream;
|
||||
use tokio_stream::StreamExt;
|
||||
|
||||
pub(crate) fn command_definition() -> Command {
|
||||
Command::new("dictionary")
|
||||
.about("Manipulate a hash dictionary file.")
|
||||
.subcommand(
|
||||
Command::new("lookup")
|
||||
.about("Lookup a hash in the dictionary")
|
||||
.arg(Arg::new("hash").help("The hash to look up").required(true))
|
||||
.arg(
|
||||
Arg::new("group")
|
||||
.help(
|
||||
"Check each group for a match. \
|
||||
If no group is specified, all groups are checked.",
|
||||
)
|
||||
.short('g')
|
||||
.long("group")
|
||||
.action(ArgAction::Append)
|
||||
.value_parser(value_parser!(HashGroup)),
|
||||
),
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("add")
|
||||
.about(
|
||||
"Add strings to the dictionary. \
|
||||
Strings are read line by line from the given file.",
|
||||
)
|
||||
.arg(
|
||||
Arg::new("group")
|
||||
.help("The dictionary group to put these strings in.")
|
||||
.short('g')
|
||||
.long("group")
|
||||
.value_parser(value_parser!(HashGroup))
|
||||
.default_value("other"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("file")
|
||||
.help("Path to a file to read strings from.")
|
||||
.required(true)
|
||||
.value_parser(value_parser!(PathBuf)),
|
||||
),
|
||||
)
|
||||
.subcommand(Command::new("save").about(
|
||||
"Save back the currently loaded dictionary, with hashes pre-computed.\
|
||||
Pre-computing hashes speeds up loading large dictionaries, as they would \
|
||||
otherwise need to be computed on the fly.",
|
||||
))
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all)]
|
||||
pub(crate) async fn run(ctx: Arc<RwLock<dtmt::Context>>, matches: &ArgMatches) -> Result<()> {
|
||||
match matches.subcommand() {
|
||||
Some(("lookup", sub_matches)) => {
|
||||
let hash = sub_matches
|
||||
.get_one::<u64>("hash")
|
||||
.expect("required argument not found");
|
||||
|
||||
let groups = sub_matches
|
||||
.get_many::<HashGroup>("group")
|
||||
.unwrap_or_default();
|
||||
|
||||
for group in groups {
|
||||
let value = lookup_hash(ctx.clone(), *hash, *group).await;
|
||||
println!("{}", value);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
Some(("add", sub_matches)) => {
|
||||
let path = sub_matches
|
||||
.get_one::<PathBuf>("file")
|
||||
.expect("required argument not found");
|
||||
let group = sub_matches
|
||||
.get_one::<HashGroup>("group")
|
||||
.expect("required argument not found");
|
||||
|
||||
let r: BufReader<Box<dyn tokio::io::AsyncRead + std::marker::Unpin>> = if let Some(name) = path.file_name() && name == "-" {
|
||||
let f = tokio::io::stdin();
|
||||
BufReader::new(Box::new(f))
|
||||
} else {
|
||||
let f = File::open(&path).await?;
|
||||
BufReader::new(Box::new(f))
|
||||
};
|
||||
|
||||
let lines: Vec<_> = LinesStream::new(r.lines()).collect().await;
|
||||
{
|
||||
let mut ctx = ctx.write().await;
|
||||
for line in lines.into_iter() {
|
||||
ctx.lookup.add(line?, *group);
|
||||
}
|
||||
}
|
||||
|
||||
let out_path = matches
|
||||
.get_one::<PathBuf>("dictionary")
|
||||
.expect("no default value for 'dictionary' parameter");
|
||||
let f = File::create(out_path)
|
||||
.await
|
||||
.wrap_err("Failed to open dictionary file")
|
||||
.with_suggestion(|| {
|
||||
format!(
|
||||
"Make sure the parent directories of '{}' exist and are writable",
|
||||
out_path.display()
|
||||
)
|
||||
})
|
||||
.with_section(|| out_path.display().to_string().header("Path:"))?;
|
||||
|
||||
ctx.read()
|
||||
.await
|
||||
.lookup
|
||||
.to_csv(f)
|
||||
.await
|
||||
.wrap_err("Failed to write dictionary to disk")
|
||||
}
|
||||
Some(("save", _)) => {
|
||||
let out_path = matches
|
||||
.get_one::<PathBuf>("dictionary")
|
||||
.expect("no default value for 'dictionary' parameter");
|
||||
let f = File::create(out_path)
|
||||
.await
|
||||
.wrap_err("Failed to open dictionary file")
|
||||
.with_suggestion(|| {
|
||||
format!(
|
||||
"Make sure the parent directories of '{}' exist and are writable",
|
||||
out_path.display()
|
||||
)
|
||||
})
|
||||
.with_section(|| out_path.display().to_string().header("Path:"))?;
|
||||
|
||||
ctx.read()
|
||||
.await
|
||||
.lookup
|
||||
.to_csv(f)
|
||||
.await
|
||||
.wrap_err("Failed to write dictionary to disk")
|
||||
}
|
||||
_ => unreachable!(
|
||||
"clap is configured to require a subcommand, and they're all handled above"
|
||||
),
|
||||
}
|
||||
}
|
|
@ -1,20 +1,25 @@
|
|||
#![feature(io_error_more)]
|
||||
#![feature(let_chains)]
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
|
||||
use clap::parser::ValueSource;
|
||||
use clap::value_parser;
|
||||
use clap::{command, Arg, ArgAction};
|
||||
use color_eyre::eyre::Result;
|
||||
use color_eyre::eyre::{Context, Result};
|
||||
use color_eyre::{Help, SectionExt};
|
||||
use tokio::fs::File;
|
||||
use tokio::io::BufReader;
|
||||
use tokio::sync::RwLock;
|
||||
use tracing_error::ErrorLayer;
|
||||
use tracing_subscriber::prelude::*;
|
||||
use tracing_subscriber::EnvFilter;
|
||||
|
||||
use dtmt::Context;
|
||||
|
||||
mod cmd {
|
||||
pub mod build;
|
||||
pub mod bundle;
|
||||
pub mod dictionary;
|
||||
pub mod murmur;
|
||||
pub mod new;
|
||||
mod util;
|
||||
|
@ -38,8 +43,20 @@ async fn main() -> Result<()> {
|
|||
May be specified multiple times.",
|
||||
),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("dictionary")
|
||||
.help(
|
||||
"Path to a dictionary file CSV format used to look up pre-computed murmur hashes.\
|
||||
\nWill default to `dictionary.csv` in the current directory.",
|
||||
)
|
||||
.default_value("dictionary.csv")
|
||||
.long("dict")
|
||||
.global(true)
|
||||
.value_parser(value_parser!(PathBuf)),
|
||||
)
|
||||
.subcommand(cmd::build::command_definition())
|
||||
.subcommand(cmd::bundle::command_definition())
|
||||
.subcommand(cmd::dictionary::command_definition())
|
||||
.subcommand(cmd::murmur::command_definition())
|
||||
.subcommand(cmd::new::command_definition())
|
||||
.subcommand(cmd::watch::command_definition())
|
||||
|
@ -57,15 +74,50 @@ async fn main() -> Result<()> {
|
|||
.init();
|
||||
}
|
||||
|
||||
let ctx = Context::new();
|
||||
let ctx = dtmt::Context::new();
|
||||
let ctx = Arc::new(RwLock::new(ctx));
|
||||
|
||||
{
|
||||
let path = matches
|
||||
.get_one::<PathBuf>("dictionary")
|
||||
.cloned()
|
||||
.expect("no default value for 'dictionary' parameter");
|
||||
let is_default = matches.value_source("dictionary") == Some(ValueSource::DefaultValue);
|
||||
let ctx = ctx.clone();
|
||||
|
||||
tokio::spawn(async move {
|
||||
let mut ctx = ctx.write().await;
|
||||
let res = File::open(&path)
|
||||
.await
|
||||
.wrap_err("Failed to open dictionary file")
|
||||
.with_section(|| path.display().to_string().header("Path:"));
|
||||
|
||||
let f = match res {
|
||||
Ok(f) => f,
|
||||
Err(err) => {
|
||||
if is_default {
|
||||
return;
|
||||
}
|
||||
tracing::error!("{}", err);
|
||||
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
let r = BufReader::new(f);
|
||||
if let Err(err) = ctx.lookup.from_csv(r).await {
|
||||
tracing::error!("{}", err);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
match matches.subcommand() {
|
||||
Some(("bundle", sub_matches)) => cmd::bundle::run(ctx, sub_matches).await?,
|
||||
Some(("murmur", sub_matches)) => cmd::murmur::run(ctx, sub_matches).await?,
|
||||
Some(("new", sub_matches)) => cmd::new::run(ctx, sub_matches).await?,
|
||||
Some(("build", sub_matches)) => cmd::build::run(ctx, sub_matches).await?,
|
||||
Some(("watch", sub_matches)) => cmd::watch::run(ctx, sub_matches).await?,
|
||||
Some(("dictionary", sub_matches)) => cmd::dictionary::run(ctx, sub_matches).await?,
|
||||
_ => unreachable!(
|
||||
"clap is configured to require a subcommand, and they're all handled above"
|
||||
),
|
||||
|
|
|
@ -1,10 +1,20 @@
|
|||
use std::sync::Arc;
|
||||
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
use crate::murmur::{Dictionary, HashGroup, Murmur32, Murmur64};
|
||||
|
||||
pub struct Context {
|
||||
pub lookup: Dictionary,
|
||||
pub oodle: Option<String>,
|
||||
}
|
||||
|
||||
impl Context {
|
||||
pub fn new() -> Self {
|
||||
Self { oodle: None }
|
||||
Self {
|
||||
lookup: Dictionary::new(),
|
||||
oodle: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -13,3 +23,31 @@ impl Default for Context {
|
|||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn lookup_hash<M>(ctx: Arc<RwLock<Context>>, hash: M, group: HashGroup) -> String
|
||||
where
|
||||
M: Into<Murmur64>,
|
||||
{
|
||||
let hash = hash.into();
|
||||
if let Some(s) = ctx.read().await.lookup.lookup(hash, group) {
|
||||
tracing::debug!(%hash, string = s, "Murmur64 lookup successful");
|
||||
s.to_owned()
|
||||
} else {
|
||||
tracing::debug!(%hash, "Murmur64 lookup failed");
|
||||
format!("{:016X}", hash)
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn lookup_hash_short<M>(ctx: Arc<RwLock<Context>>, hash: M, group: HashGroup) -> String
|
||||
where
|
||||
M: Into<Murmur32>,
|
||||
{
|
||||
let hash = hash.into();
|
||||
if let Some(s) = ctx.read().await.lookup.lookup_short(hash, group) {
|
||||
tracing::debug!(%hash, string = s, "Murmur32 lookup successful");
|
||||
s.to_owned()
|
||||
} else {
|
||||
tracing::debug!(%hash, "Murmur32 lookup failed");
|
||||
format!("{:08X}", hash)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
mod bundle;
|
||||
mod context;
|
||||
pub mod murmur;
|
||||
mod oodle;
|
||||
|
||||
pub use bundle::decompress;
|
||||
pub use context::lookup_hash;
|
||||
pub use context::lookup_hash_short;
|
||||
pub use context::Context;
|
||||
|
|
154
src/murmur/dictionary.rs
Normal file
154
src/murmur/dictionary.rs
Normal file
|
@ -0,0 +1,154 @@
|
|||
use clap::ValueEnum;
|
||||
use color_eyre::{eyre::Context, Help, Result, SectionExt};
|
||||
use csv_async::{AsyncDeserializer, AsyncSerializer};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_stream::StreamExt;
|
||||
|
||||
use super::{murmurhash64, Murmur32, Murmur64, SEED};
|
||||
|
||||
#[derive(Copy, Clone, Deserialize, PartialEq, Serialize, ValueEnum)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum HashGroup {
|
||||
Filename,
|
||||
Filetype,
|
||||
Other,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for HashGroup {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
HashGroup::Filename => write!(f, "filename"),
|
||||
HashGroup::Filetype => write!(f, "filetype"),
|
||||
HashGroup::Other => write!(f, "other"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for HashGroup {
|
||||
fn default() -> Self {
|
||||
Self::Other
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize)]
|
||||
struct Row {
|
||||
// NOTE: The order of fields is important, as the CSV serializer copies that.
|
||||
value: String,
|
||||
long: Option<Murmur64>,
|
||||
short: Option<Murmur32>,
|
||||
#[serde(default)]
|
||||
group: HashGroup,
|
||||
}
|
||||
|
||||
struct Entry {
|
||||
value: String,
|
||||
long: Murmur64,
|
||||
short: Murmur32,
|
||||
group: HashGroup,
|
||||
}
|
||||
|
||||
pub struct Dictionary {
|
||||
entries: Vec<Entry>,
|
||||
}
|
||||
|
||||
impl Default for Dictionary {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl Dictionary {
|
||||
pub fn new() -> Self {
|
||||
Self { entries: vec![] }
|
||||
}
|
||||
|
||||
pub async fn from_csv<R>(&mut self, r: R) -> Result<()>
|
||||
where
|
||||
R: AsyncRead + std::marker::Unpin + std::marker::Send,
|
||||
{
|
||||
let r = AsyncDeserializer::from_reader(r);
|
||||
let mut records = r.into_deserialize::<Row>();
|
||||
|
||||
while let Some(row) = records.next().await {
|
||||
let record = row?;
|
||||
let value = record.value;
|
||||
let long = record.long.unwrap_or_else(|| {
|
||||
Murmur64::from(murmurhash64::hash(value.as_bytes(), SEED as u64))
|
||||
});
|
||||
let short = record
|
||||
.short
|
||||
.unwrap_or_else(|| Murmur32::from(murmurhash64::hash32(value.as_bytes(), SEED)));
|
||||
|
||||
let entry = Entry {
|
||||
value,
|
||||
long,
|
||||
short,
|
||||
group: record.group,
|
||||
};
|
||||
|
||||
self.entries.push(entry);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn to_csv<W>(&self, w: W) -> Result<()>
|
||||
where
|
||||
W: AsyncWrite + std::marker::Unpin,
|
||||
{
|
||||
let mut w = AsyncSerializer::from_writer(w);
|
||||
for (i, entry) in self.entries.iter().enumerate() {
|
||||
let row = Row {
|
||||
long: Some(entry.long),
|
||||
short: Some(entry.short),
|
||||
value: entry.value.clone(),
|
||||
group: entry.group,
|
||||
};
|
||||
|
||||
w.serialize(row)
|
||||
.await
|
||||
.wrap_err("Failed to serialize entry")
|
||||
.with_section(|| {
|
||||
let s = format!(
|
||||
"Index: {}\nValue: {}\n64bit: {:#016X}\n32bit: {:#08X}\nGroup: {}",
|
||||
i, entry.value, entry.long, entry.short, entry.group
|
||||
);
|
||||
|
||||
s.header("Entry")
|
||||
})?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn add(&mut self, value: String, group: HashGroup) {
|
||||
let long = Murmur64::from(murmurhash64::hash(value.as_bytes(), SEED as u64));
|
||||
let short = Murmur32::from(murmurhash64::hash32(value.as_bytes(), SEED));
|
||||
|
||||
let entry = Entry {
|
||||
long,
|
||||
short,
|
||||
value,
|
||||
group,
|
||||
};
|
||||
|
||||
self.entries.push(entry);
|
||||
}
|
||||
|
||||
pub fn lookup(&self, hash: Murmur64, group: HashGroup) -> Option<&String> {
|
||||
self.entries
|
||||
.iter()
|
||||
.filter(|e| e.group == group)
|
||||
.find(|e| e.long == hash)
|
||||
.map(|e| &e.value)
|
||||
}
|
||||
|
||||
pub fn lookup_short(&self, hash: Murmur32, group: HashGroup) -> Option<&String> {
|
||||
self.entries
|
||||
.iter()
|
||||
.filter(|e| e.group == group)
|
||||
.find(|e| e.short == hash)
|
||||
.map(|e| &e.value)
|
||||
}
|
||||
}
|
172
src/murmur/mod.rs
Normal file
172
src/murmur/mod.rs
Normal file
|
@ -0,0 +1,172 @@
|
|||
use std::fmt;
|
||||
use std::num::ParseIntError;
|
||||
use std::ops::Deref;
|
||||
|
||||
use serde::de::Visitor;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde::{Deserializer, Serializer};
|
||||
|
||||
mod dictionary;
|
||||
// Currently unused
|
||||
// mod murmurhash32;
|
||||
mod murmurhash64;
|
||||
|
||||
pub const SEED: u32 = 0;
|
||||
|
||||
pub use dictionary::Dictionary;
|
||||
pub use dictionary::HashGroup;
|
||||
pub use murmurhash64::hash;
|
||||
pub use murmurhash64::hash32;
|
||||
pub use murmurhash64::hash_inverse as inverse;
|
||||
|
||||
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
|
||||
pub struct Murmur64(u64);
|
||||
|
||||
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
|
||||
pub struct Murmur32(u32);
|
||||
|
||||
impl Deref for Murmur64 {
|
||||
type Target = u64;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl From<u64> for Murmur64 {
|
||||
fn from(value: u64) -> Self {
|
||||
Self(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&str> for Murmur64 {
|
||||
type Error = ParseIntError;
|
||||
|
||||
fn try_from(value: &str) -> Result<Self, Self::Error> {
|
||||
u64::from_str_radix(value, 16).map(Self)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::UpperHex for Murmur64 {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
fmt::UpperHex::fmt(&self.0, f)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for Murmur64 {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
fmt::UpperHex::fmt(&self.0, f)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Visitor<'de> for Murmur64 {
|
||||
type Value = Self;
|
||||
|
||||
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
||||
formatter.write_str(
|
||||
"an usinged 64 bit integer \
|
||||
or a string in hexadecimal format encoding such an integer",
|
||||
)
|
||||
}
|
||||
|
||||
fn visit_u64<E>(self, value: u64) -> Result<Self::Value, E> {
|
||||
Ok(Self::from(value))
|
||||
}
|
||||
|
||||
fn visit_str<E>(self, value: &str) -> Result<Self::Value, E>
|
||||
where
|
||||
E: serde::de::Error,
|
||||
{
|
||||
match Murmur64::try_from(value) {
|
||||
Ok(hash) => Ok(hash),
|
||||
Err(err) => Err(E::custom(err)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for Murmur64 {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: Deserializer<'de>,
|
||||
{
|
||||
deserializer.deserialize_any(Self(0))
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for Murmur64 {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
serializer.serialize_str(&format!("{:016X}", self))
|
||||
}
|
||||
}
|
||||
|
||||
impl From<u32> for Murmur32 {
|
||||
fn from(value: u32) -> Self {
|
||||
Self(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&str> for Murmur32 {
|
||||
type Error = ParseIntError;
|
||||
|
||||
fn try_from(value: &str) -> Result<Self, Self::Error> {
|
||||
u32::from_str_radix(value, 8).map(Self)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::UpperHex for Murmur32 {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
fmt::UpperHex::fmt(&self.0, f)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for Murmur32 {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
fmt::UpperHex::fmt(&self.0, f)
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for Murmur32 {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
serializer.serialize_str(&format!("{:08X}", self))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Visitor<'de> for Murmur32 {
|
||||
type Value = Self;
|
||||
|
||||
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
||||
formatter.write_str(
|
||||
"an usinged 32 bit integer \
|
||||
or a string in hexadecimal format encoding such an integer",
|
||||
)
|
||||
}
|
||||
|
||||
fn visit_u32<E>(self, value: u32) -> Result<Self::Value, E> {
|
||||
Ok(Self::from(value))
|
||||
}
|
||||
|
||||
fn visit_str<E>(self, value: &str) -> Result<Self::Value, E>
|
||||
where
|
||||
E: serde::de::Error,
|
||||
{
|
||||
match Murmur32::try_from(value) {
|
||||
Ok(hash) => Ok(hash),
|
||||
Err(err) => Err(E::custom(err)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for Murmur32 {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: Deserializer<'de>,
|
||||
{
|
||||
deserializer.deserialize_any(Self(0))
|
||||
}
|
||||
}
|
64
src/murmur/murmurhash32.rs
Normal file
64
src/murmur/murmurhash32.rs
Normal file
|
@ -0,0 +1,64 @@
|
|||
// Copyright (C) 2022 Lucas Schwiderski
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
//
|
||||
// Adapted from https://github.com/badboy/murmurhash64-rs
|
||||
|
||||
// 'M' and 'R' are mixing constants generated offline.
|
||||
// They're not really 'magic', they just happen to work well.
|
||||
const M: u32 = 0x5bd1e995;
const R: u8 = 24;

/// Computes a 32 bit Murmur-style hash of `key` with the given `seed`.
///
/// The key is consumed in little-endian words of four bytes; up to three
/// trailing bytes are mixed in separately before the final avalanche.
pub fn hash(key: &[u8], seed: u32) -> u32 {
    // Initialize the hash to a "random" value
    let mut h: u32 = seed ^ (key.len() as u32).wrapping_mul(M);

    let mut words = key.chunks_exact(4);
    for word in words.by_ref() {
        let mut k = u32::from_le_bytes([word[0], word[1], word[2], word[3]]);

        k = k.wrapping_mul(M);
        k ^= k >> R;
        k = k.wrapping_mul(M);

        h ^= k;
        h = h.wrapping_mul(M);
    }

    // Mix in the 1 to 3 bytes that did not fill a whole word.
    let tail = words.remainder();
    if !tail.is_empty() {
        for (pos, &byte) in tail.iter().enumerate() {
            h ^= (byte as u32) << (8 * pos);
        }
        h = h.wrapping_mul(M);
    }

    h ^= h >> 13;
    h = h.wrapping_mul(M);
    h ^= h >> 15;
    h
}
|
122
src/murmur/murmurhash64.rs
Normal file
122
src/murmur/murmurhash64.rs
Normal file
|
@ -0,0 +1,122 @@
|
|||
// Copyright (C) 2022 Lucas Schwiderski
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
//
|
||||
// Adapted from https://github.com/badboy/murmurhash64-rs
|
||||
|
||||
// 'M' and 'R' are mixing constants generated offline.
|
||||
// They're not really 'magic', they just happen to work well.
|
||||
const M: u64 = 0xc6a4a7935bd1e995;
// Multiplicative inverse of `M` modulo 2^64
const M_INVERSE: u64 = 0x5f7a0ea7e59b19bd;
const R: u8 = 47;

/// Assembles up to eight bytes into a `u64`, first byte least significant.
fn read_le(bytes: &[u8]) -> u64 {
    bytes
        .iter()
        .rev()
        .fold(0, |acc, &byte| (acc << 8) | u64::from(byte))
}

/// Computes the 64 bit Murmur hash of `key` with the given `seed`.
///
/// The key is consumed in little-endian words of eight bytes; up to seven
/// trailing bytes are mixed in separately before the final avalanche.
pub fn hash(key: &[u8], seed: u64) -> u64 {
    let mut h: u64 = seed ^ (key.len() as u64).wrapping_mul(M);

    let mut words = key.chunks_exact(8);
    for word in words.by_ref() {
        let mut k = read_le(word);

        k = k.wrapping_mul(M);
        k ^= k >> R;
        k = k.wrapping_mul(M);

        h ^= k;
        h = h.wrapping_mul(M);
    }

    // Mix in the 1 to 7 bytes that did not fill a whole word.
    let tail = words.remainder();
    if !tail.is_empty() {
        h ^= read_le(tail);
        h = h.wrapping_mul(M);
    }

    h ^= h >> R;
    h = h.wrapping_mul(M);
    h ^= h >> R;
    h
}

/// Reverses [`hash`] for keys that are exactly eight bytes long.
///
/// The result is the key interpreted as a little-endian `u64`, which is why
/// the initial state is rebuilt with a key length of 8.
pub fn hash_inverse(hash: u64, seed: u64) -> u64 {
    // Undo the final avalanche, then the multiplication that followed the
    // single word being mixed in. `x ^= x >> R` is its own inverse because
    // `R` is larger than half the word size.
    let mut state = hash;
    state ^= state >> R;
    state = state.wrapping_mul(M_INVERSE);
    state ^= state >> R;
    state = state.wrapping_mul(M_INVERSE);

    // Remove the initial state, leaving the mixed key word.
    let initial: u64 = seed ^ M.wrapping_mul(8);
    let mut k: u64 = state ^ initial;

    // Undo the per-word mixing.
    k = k.wrapping_mul(M_INVERSE);
    k ^= k >> R;
    k.wrapping_mul(M_INVERSE)
}

/// Computes the 32 bit hash of `key`: the upper half of the 64 bit hash.
pub fn hash32(key: &[u8], seed: u32) -> u32 {
    let full = hash(key, u64::from(seed));
    (full >> 32) as u32
}
|
||||
|
||||
#[test]
fn test_hash() {
    assert_eq!(0, hash("".as_bytes(), 0));
    assert_eq!(0xc26e8bc196329b0f, hash("".as_bytes(), 10));
    assert_eq!(0xa14e8dfa2cd117e2, hash("lua".as_bytes(), 0));
    assert_eq!(
        0x069A33456AAD3042,
        hash("twitch_intervention".as_bytes(), 0)
    );
}

// `hash_inverse` reverses the hash of keys that are exactly eight bytes
// long, yielding the key as a little-endian integer.
#[test]
fn test_inverse() {
    for key in [0u64, 1, 0x0123_4567_89ab_cdef, u64::MAX] {
        for seed in [0u64, 10] {
            let hashed = hash(&key.to_le_bytes(), seed);
            assert_eq!(key, hash_inverse(hashed, seed));
        }
    }
}
|
Loading…
Add table
Reference in a new issue