From 75c459672cf462756c322dd87842737130c33a9f Mon Sep 17 00:00:00 2001 From: Lucas Schwiderski Date: Thu, 3 Nov 2022 20:24:14 +0100 Subject: [PATCH] feat: Implement hash dictionary --- Cargo.lock | 116 +++++++++++++++++++++++++ Cargo.toml | 8 +- src/bin/cmd/dictionary.rs | 152 ++++++++++++++++++++++++++++++++ src/bin/dtmt.rs | 60 ++++++++++++- src/context.rs | 40 ++++++++- src/lib.rs | 3 + src/murmur/dictionary.rs | 154 +++++++++++++++++++++++++++++++++ src/murmur/mod.rs | 172 +++++++++++++++++++++++++++++++++++++ src/murmur/murmurhash32.rs | 64 ++++++++++++++ src/murmur/murmurhash64.rs | 122 ++++++++++++++++++++++++++ 10 files changed, 883 insertions(+), 8 deletions(-) create mode 100644 src/bin/cmd/dictionary.rs create mode 100644 src/murmur/dictionary.rs create mode 100644 src/murmur/mod.rs create mode 100644 src/murmur/murmurhash32.rs create mode 100644 src/murmur/murmurhash64.rs diff --git a/Cargo.lock b/Cargo.lock index 93ff37d..9ef70d7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -55,6 +55,18 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bstr" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata", + "serde", +] + [[package]] name = "bytes" version = "1.2.1" @@ -81,6 +93,7 @@ checksum = "335867764ed2de42325fafe6d18b8af74ba97ee0c590fa016f157535b42ab04b" dependencies = [ "atty", "bitflags", + "clap_derive", "clap_lex", "once_cell", "strsim", @@ -89,6 +102,19 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "clap_derive" +version = "4.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16a1b0f6422af32d5da0c58e2703320f379216ee70198241c84173a8c5ac28f3" +dependencies = [ + "heck", + "proc-macro-error", + 
"proc-macro2", + "quote", + "syn", +] + [[package]] name = "clap_lex" version = "0.3.0" @@ -125,17 +151,45 @@ dependencies = [ "tracing-error", ] +[[package]] +name = "csv-async" +version = "1.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c19b33b32fd48f83388821bd8f534b59e1b1ffd5c6c83771d1b23abd3dac2685" +dependencies = [ + "bstr", + "cfg-if", + "csv-core", + "futures", + "itoa", + "ryu", + "serde", + "tokio", + "tokio-stream", +] + +[[package]] +name = "csv-core" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + [[package]] name = "dtmt" version = "0.1.0" dependencies = [ "clap", "color-eyre", + "csv-async", "futures", "futures-util", "glob", "nanorand", "pin-project-lite", + "serde", "tempfile", "tokio", "tokio-stream", @@ -264,6 +318,12 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" +[[package]] +name = "heck" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9" + [[package]] name = "hermit-abi" version = "0.1.19" @@ -288,6 +348,12 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "itoa" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4217ad341ebadf8d8e724e264f13e593e0648f5b3e94b3896a5df283be015ecc" + [[package]] name = "lazy_static" version = "1.4.0" @@ -416,6 +482,30 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + [[package]] name = "proc-macro2" version = "1.0.47" @@ -482,6 +572,32 @@ version = "0.1.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ef03e0a2b150c7a90d01faf6254c9c48a41e95fb2a8c2ac1c6f0d2b9aefc342" +[[package]] +name = "ryu" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09" + +[[package]] +name = "serde" +version = "1.0.147" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d193d69bae983fc11a79df82342761dfbf28a99fc8d203dca4c3c1b590948965" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.147" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f1d362ca8fc9c3e3a7484440752472d68a6caa98f1ab81d99b5dfe517cec852" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "sharded-slab" version = "0.1.4" diff --git a/Cargo.toml b/Cargo.toml index 5abe55c..a75e31e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,15 +4,17 @@ version = "0.1.0" edition = "2021" [dependencies] -clap = { version = "4.0.15", features = ["color", "std", "cargo", "unicode"] } +clap = { version = "4.0.15", features = ["color", "derive", "std", "cargo", "unicode"] } color-eyre = "0.6.2" +csv-async = { version = "1.2.4", features = ["tokio", "serde"] } futures = "0.3.25" futures-util = "0.3.24" glob = "0.3.0" nanorand = "0.7.0" pin-project-lite = "0.2.9" -tokio = { version = "1.21.2", 
features = ["rt-multi-thread", "fs", "process", "macros", "tracing", "io-util"] } -tokio-stream = { version = "0.1.11", features = ["fs"] } +serde = { version = "1.0.147", features = ["derive"] } +tokio = { version = "1.21.2", features = ["rt-multi-thread", "fs", "process", "macros", "tracing", "io-util", "io-std"] } +tokio-stream = { version = "0.1.11", features = ["fs", "io-util"] } tracing = { version = "0.1.37", features = ["async-await"] } tracing-error = "0.2.0" tracing-subscriber = { version = "0.3.16", features = ["env-filter"] } diff --git a/src/bin/cmd/dictionary.rs b/src/bin/cmd/dictionary.rs new file mode 100644 index 0000000..2425f3e --- /dev/null +++ b/src/bin/cmd/dictionary.rs @@ -0,0 +1,152 @@ +use std::path::PathBuf; +use std::sync::Arc; + +use clap::{value_parser, Arg, ArgAction, ArgMatches, Command}; +use color_eyre::eyre::{Context, Result}; +use color_eyre::{Help, SectionExt}; +use dtmt::lookup_hash; +use dtmt::murmur::HashGroup; +use tokio::fs::File; +use tokio::io::{AsyncBufReadExt, BufReader}; +use tokio::sync::RwLock; +use tokio_stream::wrappers::LinesStream; +use tokio_stream::StreamExt; + +pub(crate) fn command_definition() -> Command { + Command::new("dictionary") + .about("Manipulate a hash dictionary file.") + .subcommand( + Command::new("lookup") + .about("Lookup a hash in the dictionary") + .arg(Arg::new("hash").help("The hash to look up").required(true)) + .arg( + Arg::new("group") + .help( + "Check each group for a match. \ + If no group is specified, all groups are checked.", + ) + .short('g') + .long("group") + .action(ArgAction::Append) + .value_parser(value_parser!(HashGroup)), + ), + ) + .subcommand( + Command::new("add") + .about( + "Add strings to the dictionary. 
\ + Strings are read line by line from the given file.", + ) + .arg( + Arg::new("group") + .help("The dictionary group to put these strings in.") + .short('g') + .long("group") + .value_parser(value_parser!(HashGroup)) + .default_value("other"), + ) + .arg( + Arg::new("file") + .help("Path to a file to read strings from.") + .required(true) + .value_parser(value_parser!(PathBuf)), + ), + ) + .subcommand(Command::new("save").about( + "Save back the currently loaded dictionary, with hashes pre-computed.\ + Pre-computing hashes speeds up loading large dictionaries, as they would \ + otherwise need to be computed on the fly.", + )) +} + +#[tracing::instrument(skip_all)] +pub(crate) async fn run(ctx: Arc>, matches: &ArgMatches) -> Result<()> { + match matches.subcommand() { + Some(("lookup", sub_matches)) => { + let hash = sub_matches + .get_one::("hash") + .expect("required argument not found"); + + let groups = sub_matches + .get_many::("group") + .unwrap_or_default(); + + for group in groups { + let value = lookup_hash(ctx.clone(), *hash, *group).await; + println!("{}", value); + } + + Ok(()) + } + Some(("add", sub_matches)) => { + let path = sub_matches + .get_one::("file") + .expect("required argument not found"); + let group = sub_matches + .get_one::("group") + .expect("required argument not found"); + + let r: BufReader> = if let Some(name) = path.file_name() && name == "-" { + let f = tokio::io::stdin(); + BufReader::new(Box::new(f)) + } else { + let f = File::open(&path).await?; + BufReader::new(Box::new(f)) + }; + + let lines: Vec<_> = LinesStream::new(r.lines()).collect().await; + { + let mut ctx = ctx.write().await; + for line in lines.into_iter() { + ctx.lookup.add(line?, *group); + } + } + + let out_path = matches + .get_one::("dictionary") + .expect("no default value for 'dictionary' parameter"); + let f = File::create(out_path) + .await + .wrap_err("Failed to open dictionary file") + .with_suggestion(|| { + format!( + "Make sure the parent directories 
of '{}' exist and are writable", + out_path.display() + ) + }) + .with_section(|| out_path.display().to_string().header("Path:"))?; + + ctx.read() + .await + .lookup + .to_csv(f) + .await + .wrap_err("Failed to write dictionary to disk") + } + Some(("save", _)) => { + let out_path = matches + .get_one::("dictionary") + .expect("no default value for 'dictionary' parameter"); + let f = File::create(out_path) + .await + .wrap_err("Failed to open dictionary file") + .with_suggestion(|| { + format!( + "Make sure the parent directories of '{}' exist and are writable", + out_path.display() + ) + }) + .with_section(|| out_path.display().to_string().header("Path:"))?; + + ctx.read() + .await + .lookup + .to_csv(f) + .await + .wrap_err("Failed to write dictionary to disk") + } + _ => unreachable!( + "clap is configured to require a subcommand, and they're all handled above" + ), + } +} diff --git a/src/bin/dtmt.rs b/src/bin/dtmt.rs index 5118c0b..b053c11 100644 --- a/src/bin/dtmt.rs +++ b/src/bin/dtmt.rs @@ -1,20 +1,25 @@ #![feature(io_error_more)] #![feature(let_chains)] +use std::path::PathBuf; use std::sync::Arc; +use clap::parser::ValueSource; +use clap::value_parser; use clap::{command, Arg, ArgAction}; -use color_eyre::eyre::Result; +use color_eyre::eyre::{Context, Result}; +use color_eyre::{Help, SectionExt}; +use tokio::fs::File; +use tokio::io::BufReader; use tokio::sync::RwLock; use tracing_error::ErrorLayer; use tracing_subscriber::prelude::*; use tracing_subscriber::EnvFilter; -use dtmt::Context; - mod cmd { pub mod build; pub mod bundle; + pub mod dictionary; pub mod murmur; pub mod new; mod util; @@ -38,8 +43,20 @@ async fn main() -> Result<()> { May be specified multiple times.", ), ) + .arg( + Arg::new("dictionary") + .help( + "Path to a dictionary file CSV format used to look up pre-computed murmur hashes.\ + \nWill default to `dictionary.csv` in the current directory.", + ) + .default_value("dictionary.csv") + .long("dict") + .global(true) + 
.value_parser(value_parser!(PathBuf)), + ) .subcommand(cmd::build::command_definition()) .subcommand(cmd::bundle::command_definition()) + .subcommand(cmd::dictionary::command_definition()) .subcommand(cmd::murmur::command_definition()) .subcommand(cmd::new::command_definition()) .subcommand(cmd::watch::command_definition()) @@ -57,15 +74,50 @@ async fn main() -> Result<()> { .init(); } - let ctx = Context::new(); + let ctx = dtmt::Context::new(); let ctx = Arc::new(RwLock::new(ctx)); + { + let path = matches + .get_one::("dictionary") + .cloned() + .expect("no default value for 'dictionary' parameter"); + let is_default = matches.value_source("dictionary") == Some(ValueSource::DefaultValue); + let ctx = ctx.clone(); + + tokio::spawn(async move { + let mut ctx = ctx.write().await; + let res = File::open(&path) + .await + .wrap_err("Failed to open dictionary file") + .with_section(|| path.display().to_string().header("Path:")); + + let f = match res { + Ok(f) => f, + Err(err) => { + if is_default { + return; + } + tracing::error!("{}", err); + + return; + } + }; + + let r = BufReader::new(f); + if let Err(err) = ctx.lookup.from_csv(r).await { + tracing::error!("{}", err); + } + }); + } + match matches.subcommand() { Some(("bundle", sub_matches)) => cmd::bundle::run(ctx, sub_matches).await?, Some(("murmur", sub_matches)) => cmd::murmur::run(ctx, sub_matches).await?, Some(("new", sub_matches)) => cmd::new::run(ctx, sub_matches).await?, Some(("build", sub_matches)) => cmd::build::run(ctx, sub_matches).await?, Some(("watch", sub_matches)) => cmd::watch::run(ctx, sub_matches).await?, + Some(("dictionary", sub_matches)) => cmd::dictionary::run(ctx, sub_matches).await?, _ => unreachable!( "clap is configured to require a subcommand, and they're all handled above" ), diff --git a/src/context.rs b/src/context.rs index 79c73fd..0d0ff41 100644 --- a/src/context.rs +++ b/src/context.rs @@ -1,10 +1,20 @@ +use std::sync::Arc; + +use tokio::sync::RwLock; + +use 
crate::murmur::{Dictionary, HashGroup, Murmur32, Murmur64}; + pub struct Context { + pub lookup: Dictionary, pub oodle: Option, } impl Context { pub fn new() -> Self { - Self { oodle: None } + Self { + lookup: Dictionary::new(), + oodle: None, + } } } @@ -13,3 +23,31 @@ impl Default for Context { Self::new() } } + +pub async fn lookup_hash(ctx: Arc>, hash: M, group: HashGroup) -> String +where + M: Into, +{ + let hash = hash.into(); + if let Some(s) = ctx.read().await.lookup.lookup(hash, group) { + tracing::debug!(%hash, string = s, "Murmur64 lookup successful"); + s.to_owned() + } else { + tracing::debug!(%hash, "Murmur64 lookup failed"); + format!("{:016X}", hash) + } +} + +pub async fn lookup_hash_short(ctx: Arc>, hash: M, group: HashGroup) -> String +where + M: Into, +{ + let hash = hash.into(); + if let Some(s) = ctx.read().await.lookup.lookup_short(hash, group) { + tracing::debug!(%hash, string = s, "Murmur32 lookup successful"); + s.to_owned() + } else { + tracing::debug!(%hash, "Murmur32 lookup failed"); + format!("{:08X}", hash) + } +} diff --git a/src/lib.rs b/src/lib.rs index a25ce0c..caef980 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,9 @@ mod bundle; mod context; +pub mod murmur; mod oodle; pub use bundle::decompress; +pub use context::lookup_hash; +pub use context::lookup_hash_short; pub use context::Context; diff --git a/src/murmur/dictionary.rs b/src/murmur/dictionary.rs new file mode 100644 index 0000000..2580f32 --- /dev/null +++ b/src/murmur/dictionary.rs @@ -0,0 +1,154 @@ +use clap::ValueEnum; +use color_eyre::{eyre::Context, Help, Result, SectionExt}; +use csv_async::{AsyncDeserializer, AsyncSerializer}; +use serde::{Deserialize, Serialize}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio_stream::StreamExt; + +use super::{murmurhash64, Murmur32, Murmur64, SEED}; + +#[derive(Copy, Clone, Deserialize, PartialEq, Serialize, ValueEnum)] +#[serde(rename_all = "snake_case")] +pub enum HashGroup { + Filename, + Filetype, + Other, +} + 
+impl std::fmt::Display for HashGroup { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + HashGroup::Filename => write!(f, "filename"), + HashGroup::Filetype => write!(f, "filetype"), + HashGroup::Other => write!(f, "other"), + } + } +} + +impl Default for HashGroup { + fn default() -> Self { + Self::Other + } +} + +#[derive(Deserialize, Serialize)] +struct Row { + // NOTE: The order of fields is important, as the CSV serializer copies that. + value: String, + long: Option, + short: Option, + #[serde(default)] + group: HashGroup, +} + +struct Entry { + value: String, + long: Murmur64, + short: Murmur32, + group: HashGroup, +} + +pub struct Dictionary { + entries: Vec, +} + +impl Default for Dictionary { + fn default() -> Self { + Self::new() + } +} + +impl Dictionary { + pub fn new() -> Self { + Self { entries: vec![] } + } + + pub async fn from_csv(&mut self, r: R) -> Result<()> + where + R: AsyncRead + std::marker::Unpin + std::marker::Send, + { + let r = AsyncDeserializer::from_reader(r); + let mut records = r.into_deserialize::(); + + while let Some(row) = records.next().await { + let record = row?; + let value = record.value; + let long = record.long.unwrap_or_else(|| { + Murmur64::from(murmurhash64::hash(value.as_bytes(), SEED as u64)) + }); + let short = record + .short + .unwrap_or_else(|| Murmur32::from(murmurhash64::hash32(value.as_bytes(), SEED))); + + let entry = Entry { + value, + long, + short, + group: record.group, + }; + + self.entries.push(entry); + } + + Ok(()) + } + + pub async fn to_csv(&self, w: W) -> Result<()> + where + W: AsyncWrite + std::marker::Unpin, + { + let mut w = AsyncSerializer::from_writer(w); + for (i, entry) in self.entries.iter().enumerate() { + let row = Row { + long: Some(entry.long), + short: Some(entry.short), + value: entry.value.clone(), + group: entry.group, + }; + + w.serialize(row) + .await + .wrap_err("Failed to serialize entry") + .with_section(|| { + let s = format!( + "Index: 
{}\nValue: {}\n64bit: {:#016X}\n32bit: {:#08X}\nGroup: {}", + i, entry.value, entry.long, entry.short, entry.group + ); + + s.header("Entry") + })?; + } + + Ok(()) + } + + pub fn add(&mut self, value: String, group: HashGroup) { + let long = Murmur64::from(murmurhash64::hash(value.as_bytes(), SEED as u64)); + let short = Murmur32::from(murmurhash64::hash32(value.as_bytes(), SEED)); + + let entry = Entry { + long, + short, + value, + group, + }; + + self.entries.push(entry); + } + + pub fn lookup(&self, hash: Murmur64, group: HashGroup) -> Option<&String> { + self.entries + .iter() + .filter(|e| e.group == group) + .find(|e| e.long == hash) + .map(|e| &e.value) + } + + pub fn lookup_short(&self, hash: Murmur32, group: HashGroup) -> Option<&String> { + self.entries + .iter() + .filter(|e| e.group == group) + .find(|e| e.short == hash) + .map(|e| &e.value) + } +} diff --git a/src/murmur/mod.rs b/src/murmur/mod.rs new file mode 100644 index 0000000..05a12ce --- /dev/null +++ b/src/murmur/mod.rs @@ -0,0 +1,172 @@ +use std::fmt; +use std::num::ParseIntError; +use std::ops::Deref; + +use serde::de::Visitor; +use serde::{Deserialize, Serialize}; +use serde::{Deserializer, Serializer}; + +mod dictionary; +// Currently unused +// mod murmurhash32; +mod murmurhash64; + +pub const SEED: u32 = 0; + +pub use dictionary::Dictionary; +pub use dictionary::HashGroup; +pub use murmurhash64::hash; +pub use murmurhash64::hash32; +pub use murmurhash64::hash_inverse as inverse; + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct Murmur64(u64); + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct Murmur32(u32); + +impl Deref for Murmur64 { + type Target = u64; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl From for Murmur64 { + fn from(value: u64) -> Self { + Self(value) + } +} + +impl TryFrom<&str> for Murmur64 { + type Error = ParseIntError; + + fn try_from(value: &str) -> Result { + u64::from_str_radix(value, 16).map(Self) + } +} + +impl fmt::UpperHex 
for Murmur64 { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::UpperHex::fmt(&self.0, f) + } +} + +impl fmt::Display for Murmur64 { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::UpperHex::fmt(&self.0, f) + } +} + +impl<'de> Visitor<'de> for Murmur64 { + type Value = Self; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str( + "an usinged 64 bit integer \ + or a string in hexadecimal format encoding such an integer", + ) + } + + fn visit_u64(self, value: u64) -> Result { + Ok(Self::from(value)) + } + + fn visit_str(self, value: &str) -> Result + where + E: serde::de::Error, + { + match Murmur64::try_from(value) { + Ok(hash) => Ok(hash), + Err(err) => Err(E::custom(err)), + } + } +} + +impl<'de> Deserialize<'de> for Murmur64 { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + deserializer.deserialize_any(Self(0)) + } +} + +impl Serialize for Murmur64 { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + serializer.serialize_str(&format!("{:016X}", self)) + } +} + +impl From for Murmur32 { + fn from(value: u32) -> Self { + Self(value) + } +} + +impl TryFrom<&str> for Murmur32 { + type Error = ParseIntError; + + fn try_from(value: &str) -> Result { + u32::from_str_radix(value, 8).map(Self) + } +} + +impl fmt::UpperHex for Murmur32 { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::UpperHex::fmt(&self.0, f) + } +} + +impl fmt::Display for Murmur32 { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::UpperHex::fmt(&self.0, f) + } +} + +impl Serialize for Murmur32 { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + serializer.serialize_str(&format!("{:08X}", self)) + } +} + +impl<'de> Visitor<'de> for Murmur32 { + type Value = Self; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str( + "an usinged 32 bit integer \ + or a 
string in hexadecimal format encoding such an integer", + ) + } + + fn visit_u32(self, value: u32) -> Result { + Ok(Self::from(value)) + } + + fn visit_str(self, value: &str) -> Result + where + E: serde::de::Error, + { + match Murmur32::try_from(value) { + Ok(hash) => Ok(hash), + Err(err) => Err(E::custom(err)), + } + } +} + +impl<'de> Deserialize<'de> for Murmur32 { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + deserializer.deserialize_any(Self(0)) + } +} diff --git a/src/murmur/murmurhash32.rs b/src/murmur/murmurhash32.rs new file mode 100644 index 0000000..fd47aca --- /dev/null +++ b/src/murmur/murmurhash32.rs @@ -0,0 +1,64 @@ +// Copyright (C) 2022 Lucas Schwiderski +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . +// +// Adapted from https://github.com/badboy/murmurhash64-rs + +// 'M' and 'R' are mixing constants generated offline. +// They're not really 'magic', they just happen to work well. 
// Multiplier and shift used by every mixing step of the 32 bit hash.
const M: u32 = 0x5bd1e995;
const R: u8 = 24;

/// Computes the 32 bit Murmur hash of `key` for the given `seed`.
///
/// The input is consumed in little-endian 4 byte words. The one to three
/// bytes that may be left over are folded in afterwards, followed by a final
/// avalanche step. The key length is truncated to 32 bits before it is mixed
/// into the initial state.
pub fn hash(key: &[u8], seed: u32) -> u32 {
    let mut state = seed ^ (key.len() as u32).wrapping_mul(M);

    let mut words = key.chunks_exact(4);
    for word in words.by_ref() {
        // Assemble the word with its first byte in the lowest 8 bits.
        let mut k = word
            .iter()
            .rev()
            .fold(0u32, |acc, &byte| (acc << 8) | u32::from(byte));

        k = k.wrapping_mul(M);
        k ^= k >> R;
        k = k.wrapping_mul(M);

        state = (state ^ k).wrapping_mul(M);
    }

    let tail = words.remainder();
    if !tail.is_empty() {
        // The leftover bytes occupy disjoint bit ranges, so XOR-ing them in
        // one go equals XOR-ing them one at a time.
        let rest = tail
            .iter()
            .rev()
            .fold(0u32, |acc, &byte| (acc << 8) | u32::from(byte));
        state = (state ^ rest).wrapping_mul(M);
    }

    state ^= state >> 13;
    state = state.wrapping_mul(M);
    state ^ (state >> 15)
}
// Multiplier and shift used by every mixing step of the 64 bit hash.
const M: u64 = 0xc6a4a7935bd1e995;
// Multiplicative inverse of `M` modulo 2^64, i.e. `M * M_INVERSE == 1` with
// wrapping arithmetic.
const M_INVERSE: u64 = 0x5f7a0ea7e59b19bd;
const R: u8 = 47;

/// Interprets up to eight bytes as a little-endian integer.
///
/// Missing high bytes are treated as zero, so the same helper serves both the
/// full 8 byte blocks and the shorter tail.
fn le_bytes_to_u64(bytes: &[u8]) -> u64 {
    bytes
        .iter()
        .rev()
        .fold(0, |acc, &byte| (acc << 8) | u64::from(byte))
}

/// Computes the 64 bit Murmur hash of `key` for the given `seed`.
///
/// The key is consumed in little-endian 8 byte blocks. The one to seven
/// bytes that may be left over are folded in afterwards, followed by a final
/// avalanche step.
pub fn hash(key: &[u8], seed: u64) -> u64 {
    // `usize` is at most 64 bits wide on supported targets, so the cast
    // cannot truncate.
    let mut h: u64 = seed ^ (key.len() as u64).wrapping_mul(M);

    let mut blocks = key.chunks_exact(8);
    for block in blocks.by_ref() {
        let mut k = le_bytes_to_u64(block);

        k = k.wrapping_mul(M);
        k ^= k >> R;
        k = k.wrapping_mul(M);

        h ^= k;
        h = h.wrapping_mul(M);
    }

    let tail = blocks.remainder();
    if !tail.is_empty() {
        // The tail bytes occupy disjoint bit ranges, so a single XOR with the
        // assembled value equals XOR-ing them in one by one.
        h ^= le_bytes_to_u64(tail);
        h = h.wrapping_mul(M);
    }

    h ^= h >> R;
    h = h.wrapping_mul(M);
    h ^= h >> R;
    h
}

/// Recovers the key of a hash that was produced from exactly 8 bytes.
///
/// Every step of [`hash`] is reversible when the key is a single block:
/// `x ^= x >> 47` is its own inverse and the multiplications are undone with
/// `M_INVERSE`. The result is the key read as a little-endian integer, i.e.
/// `hash_inverse(hash(&k.to_le_bytes(), seed), seed) == k`.
///
/// For hashes of keys with any other length the returned value is
/// meaningless.
pub fn hash_inverse(hash: u64, seed: u64) -> u64 {
    // Undo the final avalanche and the multiplication that closed the block.
    let mut h = hash;
    h ^= h >> R;
    h = h.wrapping_mul(M_INVERSE);
    h ^= h >> R;
    h = h.wrapping_mul(M_INVERSE);

    // State the forward pass started from for a key length of 8.
    let h_forward: u64 = seed ^ (M.wrapping_mul(8));
    let mut k: u64 = h ^ h_forward;

    // Undo the per-block mixing of the key itself.
    k = k.wrapping_mul(M_INVERSE);
    k ^= k >> R;
    k = k.wrapping_mul(M_INVERSE);
    k
}

/// Computes the 32 bit variant of the hash: the upper half of [`hash`].
pub fn hash32(key: &[u8], seed: u32) -> u32 {
    let h = hash(key, u64::from(seed));
    (h >> 32) as u32
}

#[test]
fn test_hash() {
    assert_eq!(0, hash("".as_bytes(), 0));
    assert_eq!(0xc26e8bc196329b0f, hash("".as_bytes(), 10));
    assert_eq!(0xa14e8dfa2cd117e2, hash("lua".as_bytes(), 0));
    assert_eq!(
        0x069A33456AAD3042,
        hash("twitch_intervention".as_bytes(), 0)
    );
}

#[test]
fn test_inverse() {
    for &seed in &[0u64, 10, u64::MAX] {
        for &key in &[0u64, 1, 0x0123_4567_89ab_cdef, u64::MAX] {
            let hashed = hash(&key.to_le_bytes(), seed);
            assert_eq!(key, hash_inverse(hashed, seed));
        }
    }
}