feat: Implement hash dictionary

This commit is contained in:
Lucas Schwiderski 2022-11-03 20:24:14 +01:00
parent cf2503214b
commit 75c459672c
Signed by: lucas
GPG key ID: AA12679AAA6DF4D8
10 changed files with 883 additions and 8 deletions

116
Cargo.lock generated
View file

@ -55,6 +55,18 @@ version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "bstr"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223"
dependencies = [
"lazy_static",
"memchr",
"regex-automata",
"serde",
]
[[package]]
name = "bytes"
version = "1.2.1"
@ -81,6 +93,7 @@ checksum = "335867764ed2de42325fafe6d18b8af74ba97ee0c590fa016f157535b42ab04b"
dependencies = [
"atty",
"bitflags",
"clap_derive",
"clap_lex",
"once_cell",
"strsim",
@ -89,6 +102,19 @@ dependencies = [
"unicode-width",
]
[[package]]
name = "clap_derive"
version = "4.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16a1b0f6422af32d5da0c58e2703320f379216ee70198241c84173a8c5ac28f3"
dependencies = [
"heck",
"proc-macro-error",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "clap_lex"
version = "0.3.0"
@ -125,17 +151,45 @@ dependencies = [
"tracing-error",
]
[[package]]
name = "csv-async"
version = "1.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c19b33b32fd48f83388821bd8f534b59e1b1ffd5c6c83771d1b23abd3dac2685"
dependencies = [
"bstr",
"cfg-if",
"csv-core",
"futures",
"itoa",
"ryu",
"serde",
"tokio",
"tokio-stream",
]
[[package]]
name = "csv-core"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
dependencies = [
"memchr",
]
[[package]]
name = "dtmt"
version = "0.1.0"
dependencies = [
"clap",
"color-eyre",
"csv-async",
"futures",
"futures-util",
"glob",
"nanorand",
"pin-project-lite",
"serde",
"tempfile",
"tokio",
"tokio-stream",
@ -264,6 +318,12 @@ version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574"
[[package]]
name = "heck"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9"
[[package]]
name = "hermit-abi"
version = "0.1.19"
@ -288,6 +348,12 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "itoa"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4217ad341ebadf8d8e724e264f13e593e0648f5b3e94b3896a5df283be015ecc"
[[package]]
name = "lazy_static"
version = "1.4.0"
@ -416,6 +482,30 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "proc-macro-error"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
dependencies = [
"proc-macro-error-attr",
"proc-macro2",
"quote",
"syn",
"version_check",
]
[[package]]
name = "proc-macro-error-attr"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
dependencies = [
"proc-macro2",
"quote",
"version_check",
]
[[package]]
name = "proc-macro2"
version = "1.0.47"
@ -482,6 +572,32 @@ version = "0.1.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ef03e0a2b150c7a90d01faf6254c9c48a41e95fb2a8c2ac1c6f0d2b9aefc342"
[[package]]
name = "ryu"
version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09"
[[package]]
name = "serde"
version = "1.0.147"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d193d69bae983fc11a79df82342761dfbf28a99fc8d203dca4c3c1b590948965"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.147"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4f1d362ca8fc9c3e3a7484440752472d68a6caa98f1ab81d99b5dfe517cec852"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "sharded-slab"
version = "0.1.4"

View file

@ -4,15 +4,17 @@ version = "0.1.0"
edition = "2021"
[dependencies]
clap = { version = "4.0.15", features = ["color", "std", "cargo", "unicode"] }
clap = { version = "4.0.15", features = ["color", "derive", "std", "cargo", "unicode"] }
color-eyre = "0.6.2"
csv-async = { version = "1.2.4", features = ["tokio", "serde"] }
futures = "0.3.25"
futures-util = "0.3.24"
glob = "0.3.0"
nanorand = "0.7.0"
pin-project-lite = "0.2.9"
tokio = { version = "1.21.2", features = ["rt-multi-thread", "fs", "process", "macros", "tracing", "io-util"] }
tokio-stream = { version = "0.1.11", features = ["fs"] }
serde = { version = "1.0.147", features = ["derive"] }
tokio = { version = "1.21.2", features = ["rt-multi-thread", "fs", "process", "macros", "tracing", "io-util", "io-std"] }
tokio-stream = { version = "0.1.11", features = ["fs", "io-util"] }
tracing = { version = "0.1.37", features = ["async-await"] }
tracing-error = "0.2.0"
tracing-subscriber = { version = "0.3.16", features = ["env-filter"] }

152
src/bin/cmd/dictionary.rs Normal file
View file

@ -0,0 +1,152 @@
use std::path::PathBuf;
use std::sync::Arc;
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
use color_eyre::eyre::{Context, Result};
use color_eyre::{Help, SectionExt};
use dtmt::lookup_hash;
use dtmt::murmur::HashGroup;
use tokio::fs::File;
use tokio::io::{AsyncBufReadExt, BufReader};
use tokio::sync::RwLock;
use tokio_stream::wrappers::LinesStream;
use tokio_stream::StreamExt;
/// Builds the `dictionary` command together with its `lookup`, `add` and `save`
/// subcommands.
///
/// One of the subcommands must be given: `subcommand_required` makes clap reject a
/// bare `dictionary` invocation instead of letting it fall through to the dispatcher.
pub(crate) fn command_definition() -> Command {
    let lookup = Command::new("lookup")
        .about("Lookup a hash in the dictionary")
        // No value parser is configured, so the value is stored as a `String`.
        .arg(Arg::new("hash").help("The hash to look up").required(true))
        .arg(
            Arg::new("group")
                .help(
                    "Check each group for a match. \
                    If no group is specified, all groups are checked.",
                )
                .short('g')
                .long("group")
                .action(ArgAction::Append)
                .value_parser(value_parser!(HashGroup)),
        );

    let add = Command::new("add")
        .about(
            "Add strings to the dictionary. \
            Strings are read line by line from the given file.",
        )
        .arg(
            Arg::new("group")
                .help("The dictionary group to put these strings in.")
                .short('g')
                .long("group")
                .value_parser(value_parser!(HashGroup))
                .default_value("other"),
        )
        .arg(
            Arg::new("file")
                .help("Path to a file to read strings from.")
                .required(true)
                .value_parser(value_parser!(PathBuf)),
        );

    // The trailing space before each line continuation matters: `\` swallows the
    // newline and the next line's leading whitespace, so without it the sentences
    // would be glued together in the rendered help.
    let save = Command::new("save").about(
        "Save back the currently loaded dictionary, with hashes pre-computed. \
        Pre-computing hashes speeds up loading large dictionaries, as they would \
        otherwise need to be computed on the fly.",
    );

    Command::new("dictionary")
        .about("Manipulate a hash dictionary file.")
        .subcommand_required(true)
        .subcommand(lookup)
        .subcommand(add)
        .subcommand(save)
}
#[tracing::instrument(skip_all)]
pub(crate) async fn run(ctx: Arc<RwLock<dtmt::Context>>, matches: &ArgMatches) -> Result<()> {
match matches.subcommand() {
Some(("lookup", sub_matches)) => {
let hash = sub_matches
.get_one::<u64>("hash")
.expect("required argument not found");
let groups = sub_matches
.get_many::<HashGroup>("group")
.unwrap_or_default();
for group in groups {
let value = lookup_hash(ctx.clone(), *hash, *group).await;
println!("{}", value);
}
Ok(())
}
Some(("add", sub_matches)) => {
let path = sub_matches
.get_one::<PathBuf>("file")
.expect("required argument not found");
let group = sub_matches
.get_one::<HashGroup>("group")
.expect("required argument not found");
let r: BufReader<Box<dyn tokio::io::AsyncRead + std::marker::Unpin>> = if let Some(name) = path.file_name() && name == "-" {
let f = tokio::io::stdin();
BufReader::new(Box::new(f))
} else {
let f = File::open(&path).await?;
BufReader::new(Box::new(f))
};
let lines: Vec<_> = LinesStream::new(r.lines()).collect().await;
{
let mut ctx = ctx.write().await;
for line in lines.into_iter() {
ctx.lookup.add(line?, *group);
}
}
let out_path = matches
.get_one::<PathBuf>("dictionary")
.expect("no default value for 'dictionary' parameter");
let f = File::create(out_path)
.await
.wrap_err("Failed to open dictionary file")
.with_suggestion(|| {
format!(
"Make sure the parent directories of '{}' exist and are writable",
out_path.display()
)
})
.with_section(|| out_path.display().to_string().header("Path:"))?;
ctx.read()
.await
.lookup
.to_csv(f)
.await
.wrap_err("Failed to write dictionary to disk")
}
Some(("save", _)) => {
let out_path = matches
.get_one::<PathBuf>("dictionary")
.expect("no default value for 'dictionary' parameter");
let f = File::create(out_path)
.await
.wrap_err("Failed to open dictionary file")
.with_suggestion(|| {
format!(
"Make sure the parent directories of '{}' exist and are writable",
out_path.display()
)
})
.with_section(|| out_path.display().to_string().header("Path:"))?;
ctx.read()
.await
.lookup
.to_csv(f)
.await
.wrap_err("Failed to write dictionary to disk")
}
_ => unreachable!(
"clap is configured to require a subcommand, and they're all handled above"
),
}
}

View file

@ -1,20 +1,25 @@
#![feature(io_error_more)]
#![feature(let_chains)]
use std::path::PathBuf;
use std::sync::Arc;
use clap::parser::ValueSource;
use clap::value_parser;
use clap::{command, Arg, ArgAction};
use color_eyre::eyre::Result;
use color_eyre::eyre::{Context, Result};
use color_eyre::{Help, SectionExt};
use tokio::fs::File;
use tokio::io::BufReader;
use tokio::sync::RwLock;
use tracing_error::ErrorLayer;
use tracing_subscriber::prelude::*;
use tracing_subscriber::EnvFilter;
use dtmt::Context;
mod cmd {
pub mod build;
pub mod bundle;
pub mod dictionary;
pub mod murmur;
pub mod new;
mod util;
@ -38,8 +43,20 @@ async fn main() -> Result<()> {
May be specified multiple times.",
),
)
.arg(
Arg::new("dictionary")
.help(
"Path to a dictionary file CSV format used to look up pre-computed murmur hashes.\
\nWill default to `dictionary.csv` in the current directory.",
)
.default_value("dictionary.csv")
.long("dict")
.global(true)
.value_parser(value_parser!(PathBuf)),
)
.subcommand(cmd::build::command_definition())
.subcommand(cmd::bundle::command_definition())
.subcommand(cmd::dictionary::command_definition())
.subcommand(cmd::murmur::command_definition())
.subcommand(cmd::new::command_definition())
.subcommand(cmd::watch::command_definition())
@ -57,15 +74,50 @@ async fn main() -> Result<()> {
.init();
}
let ctx = Context::new();
let ctx = dtmt::Context::new();
let ctx = Arc::new(RwLock::new(ctx));
{
let path = matches
.get_one::<PathBuf>("dictionary")
.cloned()
.expect("no default value for 'dictionary' parameter");
let is_default = matches.value_source("dictionary") == Some(ValueSource::DefaultValue);
let ctx = ctx.clone();
tokio::spawn(async move {
let mut ctx = ctx.write().await;
let res = File::open(&path)
.await
.wrap_err("Failed to open dictionary file")
.with_section(|| path.display().to_string().header("Path:"));
let f = match res {
Ok(f) => f,
Err(err) => {
if is_default {
return;
}
tracing::error!("{}", err);
return;
}
};
let r = BufReader::new(f);
if let Err(err) = ctx.lookup.from_csv(r).await {
tracing::error!("{}", err);
}
});
}
match matches.subcommand() {
Some(("bundle", sub_matches)) => cmd::bundle::run(ctx, sub_matches).await?,
Some(("murmur", sub_matches)) => cmd::murmur::run(ctx, sub_matches).await?,
Some(("new", sub_matches)) => cmd::new::run(ctx, sub_matches).await?,
Some(("build", sub_matches)) => cmd::build::run(ctx, sub_matches).await?,
Some(("watch", sub_matches)) => cmd::watch::run(ctx, sub_matches).await?,
Some(("dictionary", sub_matches)) => cmd::dictionary::run(ctx, sub_matches).await?,
_ => unreachable!(
"clap is configured to require a subcommand, and they're all handled above"
),

View file

@ -1,10 +1,20 @@
use std::sync::Arc;
use tokio::sync::RwLock;
use crate::murmur::{Dictionary, HashGroup, Murmur32, Murmur64};
/// Shared state handed to the individual commands.
pub struct Context {
/// Hash dictionary consulted by `lookup_hash` and `lookup_hash_short`.
pub lookup: Dictionary,
// NOTE(review): the purpose of `oodle` is not visible in this file; it starts out
// as `None` — presumably a path or name related to the `oodle` module, confirm.
pub oodle: Option<String>,
}
impl Context {
/// Creates a context with an empty dictionary and no `oodle` value.
pub fn new() -> Self {
Self { oodle: None }
Self {
lookup: Dictionary::new(),
oodle: None,
}
}
}
@ -13,3 +23,31 @@ impl Default for Context {
Self::new()
}
}
/// Resolves a 64-bit Murmur hash to a string using the dictionary stored in `ctx`.
///
/// Only dictionary entries belonging to `group` are considered. If no entry matches,
/// the hash itself is returned, formatted as upper-case hexadecimal that is
/// zero-padded to 16 digits.
pub async fn lookup_hash<M>(ctx: Arc<RwLock<Context>>, hash: M, group: HashGroup) -> String
where
    M: Into<Murmur64>,
{
    let hash = hash.into();
    // The read guard has to outlive the borrowed dictionary entry.
    let guard = ctx.read().await;

    match guard.lookup.lookup(hash, group) {
        Some(found) => {
            tracing::debug!(%hash, string = found, "Murmur64 lookup successful");
            found.to_owned()
        }
        None => {
            tracing::debug!(%hash, "Murmur64 lookup failed");
            format!("{:016X}", hash)
        }
    }
}
/// Resolves a 32-bit Murmur hash to a string using the dictionary stored in `ctx`.
///
/// Only dictionary entries belonging to `group` are considered. If no entry matches,
/// the hash itself is returned, formatted as upper-case hexadecimal that is
/// zero-padded to 8 digits.
pub async fn lookup_hash_short<M>(ctx: Arc<RwLock<Context>>, hash: M, group: HashGroup) -> String
where
    M: Into<Murmur32>,
{
    let hash = hash.into();
    // The read guard has to outlive the borrowed dictionary entry.
    let guard = ctx.read().await;

    match guard.lookup.lookup_short(hash, group) {
        Some(found) => {
            tracing::debug!(%hash, string = found, "Murmur32 lookup successful");
            found.to_owned()
        }
        None => {
            tracing::debug!(%hash, "Murmur32 lookup failed");
            format!("{:08X}", hash)
        }
    }
}

View file

@ -1,6 +1,9 @@
mod bundle;
mod context;
pub mod murmur;
mod oodle;
pub use bundle::decompress;
pub use context::lookup_hash;
pub use context::lookup_hash_short;
pub use context::Context;

154
src/murmur/dictionary.rs Normal file
View file

@ -0,0 +1,154 @@
use clap::ValueEnum;
use color_eyre::{eyre::Context, Help, Result, SectionExt};
use csv_async::{AsyncDeserializer, AsyncSerializer};
use serde::{Deserialize, Serialize};
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_stream::StreamExt;
use super::{murmurhash64, Murmur32, Murmur64, SEED};
/// Category a dictionary entry is filed under.
///
/// Dictionary lookups only consider entries of the requested group. The variants
/// are (de)serialized in `snake_case` and double as the possible values of the
/// `--group` command line argument.
// Variants deliberately carry no doc comments: the `ValueEnum` derive would pick
// them up as help text for the command line values.
#[derive(Copy, Clone, Debug, Deserialize, Eq, PartialEq, Serialize, ValueEnum)]
#[serde(rename_all = "snake_case")]
pub enum HashGroup {
    Filename,
    Filetype,
    Other,
}
impl std::fmt::Display for HashGroup {
    /// Writes the lower-case name of the group.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            HashGroup::Filename => "filename",
            HashGroup::Filetype => "filetype",
            HashGroup::Other => "other",
        };
        f.write_str(label)
    }
}
impl Default for HashGroup {
fn default() -> Self {
Self::Other
}
}
/// One record of the dictionary CSV file.
///
/// Used both when reading (`Dictionary::from_csv`) and when writing
/// (`Dictionary::to_csv`).
#[derive(Deserialize, Serialize)]
struct Row {
// NOTE: The order of fields is important, as the CSV serializer copies that.
/// The plain string the hashes were computed from.
value: String,
/// Pre-computed 64-bit hash; `from_csv` computes it from `value` when absent.
long: Option<Murmur64>,
/// Pre-computed 32-bit hash; `from_csv` computes it from `value` when absent.
short: Option<Murmur32>,
/// Group of the entry; falls back to `HashGroup::default()` when missing.
#[serde(default)]
group: HashGroup,
}
/// In-memory dictionary entry. Unlike `Row`, both hashes are always present.
struct Entry {
/// The plain string.
value: String,
/// 64-bit hash matched by `Dictionary::lookup`.
long: Murmur64,
/// 32-bit hash matched by `Dictionary::lookup_short`.
short: Murmur32,
/// Group the entry is filed under; lookups only consider one group.
group: HashGroup,
}
/// Collection of known strings with their Murmur hashes, used to map hashes back
/// to the strings they were computed from.
pub struct Dictionary {
/// Entries in insertion order; lookups scan this list front to back.
entries: Vec<Entry>,
}
impl Default for Dictionary {
fn default() -> Self {
Self::new()
}
}
impl Dictionary {
    /// Creates an empty dictionary.
    pub fn new() -> Self {
        Self { entries: vec![] }
    }

    /// Reads CSV records from `r` and appends them to the dictionary.
    ///
    /// Hashes missing from a record are computed from the record's value.
    /// Existing entries are kept.
    ///
    /// # Errors
    ///
    /// Returns the first error produced while reading or deserializing a record.
    /// Records read before the failing one remain in the dictionary.
    pub async fn from_csv<R>(&mut self, r: R) -> Result<()>
    where
        R: AsyncRead + std::marker::Unpin + std::marker::Send,
    {
        let r = AsyncDeserializer::from_reader(r);
        let mut records = r.into_deserialize::<Row>();

        while let Some(row) = records.next().await {
            let record = row?;
            let value = record.value;
            let long = record.long.unwrap_or_else(|| {
                Murmur64::from(murmurhash64::hash(value.as_bytes(), u64::from(SEED)))
            });
            let short = record
                .short
                .unwrap_or_else(|| Murmur32::from(murmurhash64::hash32(value.as_bytes(), SEED)));

            self.entries.push(Entry {
                value,
                long,
                short,
                group: record.group,
            });
        }

        Ok(())
    }

    /// Writes all entries to `w` as CSV, with both hashes pre-computed.
    ///
    /// # Errors
    ///
    /// Returns an error when an entry cannot be serialized or when the buffered
    /// output cannot be flushed to `w`.
    pub async fn to_csv<W>(&self, w: W) -> Result<()>
    where
        W: AsyncWrite + std::marker::Unpin,
    {
        let mut w = AsyncSerializer::from_writer(w);

        for (i, entry) in self.entries.iter().enumerate() {
            let row = Row {
                long: Some(entry.long),
                short: Some(entry.short),
                value: entry.value.clone(),
                group: entry.group,
            };
            w.serialize(row)
                .await
                .wrap_err("Failed to serialize entry")
                .with_section(|| {
                    // With `#`, the width includes the `0x` prefix, hence 18 and 10
                    // for 16 and 8 hexadecimal digits.
                    let s = format!(
                        "Index: {}\nValue: {}\n64bit: {:#018X}\n32bit: {:#010X}\nGroup: {}",
                        i, entry.value, entry.long, entry.short, entry.group
                    );
                    s.header("Entry")
                })?;
        }

        // Flush explicitly: a flush that only happens when the serializer is dropped
        // has no way to report a failure to the caller.
        w.flush()
            .await
            .wrap_err("Failed to flush dictionary data")?;

        Ok(())
    }

    /// Adds `value` to the dictionary under `group`, computing both of its hashes.
    ///
    /// No check for duplicates is made.
    pub fn add(&mut self, value: String, group: HashGroup) {
        let long = Murmur64::from(murmurhash64::hash(value.as_bytes(), u64::from(SEED)));
        let short = Murmur32::from(murmurhash64::hash32(value.as_bytes(), SEED));

        self.entries.push(Entry {
            long,
            short,
            value,
            group,
        });
    }

    /// Returns the string of the first entry in `group` whose 64-bit hash is `hash`.
    pub fn lookup(&self, hash: Murmur64, group: HashGroup) -> Option<&String> {
        self.entries
            .iter()
            .find(|e| e.group == group && e.long == hash)
            .map(|e| &e.value)
    }

    /// Returns the string of the first entry in `group` whose 32-bit hash is `hash`.
    pub fn lookup_short(&self, hash: Murmur32, group: HashGroup) -> Option<&String> {
        self.entries
            .iter()
            .find(|e| e.group == group && e.short == hash)
            .map(|e| &e.value)
    }
}

172
src/murmur/mod.rs Normal file
View file

@ -0,0 +1,172 @@
use std::fmt;
use std::num::ParseIntError;
use std::ops::Deref;
use serde::de::Visitor;
use serde::{Deserialize, Serialize};
use serde::{Deserializer, Serializer};
mod dictionary;
// Currently unused
// mod murmurhash32;
mod murmurhash64;
/// Seed used when the dictionary computes the hashes of its values.
pub const SEED: u32 = 0;
pub use dictionary::Dictionary;
pub use dictionary::HashGroup;
pub use murmurhash64::hash;
pub use murmurhash64::hash32;
pub use murmurhash64::hash_inverse as inverse;
/// A 64-bit Murmur hash value.
///
/// Formats as upper-case hexadecimal and serializes as a 16-digit hexadecimal
/// string.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct Murmur64(u64);
/// A 32-bit Murmur hash value.
///
/// Formats as upper-case hexadecimal and serializes as an 8-digit hexadecimal
/// string.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct Murmur32(u32);
impl Deref for Murmur64 {
type Target = u64;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl From<u64> for Murmur64 {
fn from(value: u64) -> Self {
Self(value)
}
}
impl TryFrom<&str> for Murmur64 {
    type Error = ParseIntError;

    /// Parses a hash from its hexadecimal text representation.
    ///
    /// The text must consist of hexadecimal digits only; a `0x` prefix is rejected.
    fn try_from(value: &str) -> Result<Self, Self::Error> {
        let raw = u64::from_str_radix(value, 16)?;
        Ok(Self(raw))
    }
}
impl fmt::UpperHex for Murmur64 {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::UpperHex::fmt(&self.0, f)
}
}
impl fmt::Display for Murmur64 {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::UpperHex::fmt(&self.0, f)
}
}
/// Serde visitor that builds a [`Murmur64`] from an unsigned integer or from a
/// hexadecimal string.
impl<'de> Visitor<'de> for Murmur64 {
    type Value = Self;

    /// Describes the accepted input; serde uses this text in "invalid type" errors.
    fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
        formatter.write_str(
            "an unsigned 64 bit integer \
            or a string in hexadecimal format encoding such an integer",
        )
    }

    /// Takes an integer as the raw hash value.
    fn visit_u64<E>(self, value: u64) -> Result<Self::Value, E> {
        Ok(Self::from(value))
    }

    /// Parses a string as hexadecimal digits.
    fn visit_str<E>(self, value: &str) -> Result<Self::Value, E>
    where
        E: serde::de::Error,
    {
        Murmur64::try_from(value).map_err(E::custom)
    }
}
impl<'de> Deserialize<'de> for Murmur64 {
    /// Deserializes a hash from its hexadecimal string form, the format written by
    /// the `Serialize` implementation.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        // Ask for a string explicitly. With `deserialize_any`, formats that are not
        // self-describing (such as CSV) guess the type from the text: a hash made up
        // of decimal digits only would arrive as a decimal integer, and one like
        // `00000000000000E1` as a float, yielding a wrong value or an error.
        deserializer.deserialize_str(Self(0))
    }
}
impl Serialize for Murmur64 {
    /// Serializes the hash as a string of upper-case hexadecimal digits, zero-padded
    /// to 16 characters.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let encoded = format!("{:016X}", self);
        serializer.serialize_str(&encoded)
    }
}
impl From<u32> for Murmur32 {
fn from(value: u32) -> Self {
Self(value)
}
}
impl TryFrom<&str> for Murmur32 {
    type Error = ParseIntError;

    /// Parses a hash from its hexadecimal text representation.
    ///
    /// The text must consist of hexadecimal digits only; a `0x` prefix is rejected.
    fn try_from(value: &str) -> Result<Self, Self::Error> {
        // Radix 16 matches the `{:08X}` format used when serializing and displaying
        // the hash.
        u32::from_str_radix(value, 16).map(Self)
    }
}
impl fmt::UpperHex for Murmur32 {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::UpperHex::fmt(&self.0, f)
}
}
impl fmt::Display for Murmur32 {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::UpperHex::fmt(&self.0, f)
}
}
impl Serialize for Murmur32 {
    /// Serializes the hash as a string of upper-case hexadecimal digits, zero-padded
    /// to 8 characters.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let encoded = format!("{:08X}", self);
        serializer.serialize_str(&encoded)
    }
}
impl<'de> Visitor<'de> for Murmur32 {
type Value = Self;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
formatter.write_str(
"an usinged 32 bit integer \
or a string in hexadecimal format encoding such an integer",
)
}
fn visit_u32<E>(self, value: u32) -> Result<Self::Value, E> {
Ok(Self::from(value))
}
fn visit_str<E>(self, value: &str) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
match Murmur32::try_from(value) {
Ok(hash) => Ok(hash),
Err(err) => Err(E::custom(err)),
}
}
}
impl<'de> Deserialize<'de> for Murmur32 {
    /// Deserializes a hash from its hexadecimal string form, the format written by
    /// the `Serialize` implementation.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        // Ask for a string explicitly. With `deserialize_any`, formats that are not
        // self-describing (such as CSV) guess the type from the text: a hash made up
        // of decimal digits only would arrive as a decimal integer, and one like
        // `000000E1` as a float, yielding a wrong value or an error.
        deserializer.deserialize_str(Self(0))
    }
}

View file

@ -0,0 +1,64 @@
// Copyright (C) 2022 Lucas Schwiderski
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
//
// Adapted from https://github.com/badboy/murmurhash64-rs
// 'M' and 'R' are mixing constants generated offline.
// They're not really 'magic', they just happen to work well.
// `M` is the multiplier applied after every mixing step.
const M: u32 = 0x5bd1e995;
// `R` is the right-shift distance used when mixing a 4-byte word.
const R: u8 = 24;
/// Computes the 32-bit Murmur hash of `key`, starting from `seed`.
///
/// The key is consumed as little-endian 4-byte words; up to three trailing bytes are
/// folded in separately before the final avalanche steps.
pub fn hash(key: &[u8], seed: u32) -> u32 {
    // Initialize the state to a "random" value derived from seed and length.
    let mut state = seed ^ (key.len() as u32).wrapping_mul(M);

    let mut words = key.chunks_exact(4);
    for word in &mut words {
        let mut k = u32::from_le_bytes([word[0], word[1], word[2], word[3]]);

        k = k.wrapping_mul(M);
        k ^= k >> R;
        k = k.wrapping_mul(M);

        state ^= k;
        state = state.wrapping_mul(M);
    }

    // Fold in the bytes that did not fill a whole word, lowest byte first.
    let tail = words.remainder();
    if !tail.is_empty() {
        for (idx, &byte) in tail.iter().enumerate() {
            state ^= u32::from(byte) << (8 * idx);
        }
        state = state.wrapping_mul(M);
    }

    // Final avalanche.
    state ^= state >> 13;
    state = state.wrapping_mul(M);
    state ^ (state >> 15)
}

122
src/murmur/murmurhash64.rs Normal file
View file

@ -0,0 +1,122 @@
// Copyright (C) 2022 Lucas Schwiderski
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
//
// Adapted from https://github.com/badboy/murmurhash64-rs
// 'M' and 'R' are mixing constants generated offline.
// They're not really 'magic', they just happen to work well.
// `M` is the multiplier applied after every mixing step.
const M: u64 = 0xc6a4a7935bd1e995;
// Multiplicative inverse of `M` under % 2^64
// (`M.wrapping_mul(M_INVERSE) == 1`); used by `hash_inverse` to undo multiplications.
const M_INVERSE: u64 = 0x5f7a0ea7e59b19bd;
// `R` is the right-shift distance of the xor-shift steps.
const R: u8 = 47;
/// Computes the 64-bit Murmur hash of `key`, starting from `seed`.
///
/// The key is consumed as little-endian 8-byte words; up to seven trailing bytes are
/// folded in separately before the final avalanche steps.
pub fn hash(key: &[u8], seed: u64) -> u64 {
    let mut state = seed ^ (key.len() as u64).wrapping_mul(M);

    let mut words = key.chunks_exact(8);
    for word in &mut words {
        let mut bytes = [0u8; 8];
        bytes.copy_from_slice(word);
        let mut k = u64::from_le_bytes(bytes);

        k = k.wrapping_mul(M);
        k ^= k >> R;
        k = k.wrapping_mul(M);

        state ^= k;
        state = state.wrapping_mul(M);
    }

    // Assemble the bytes that did not fill a whole word into one little-endian value
    // and mix it in.
    let tail = words.remainder();
    if !tail.is_empty() {
        let folded = tail
            .iter()
            .rev()
            .fold(0u64, |acc, &byte| (acc << 8) | u64::from(byte));
        state ^= folded;
        state = state.wrapping_mul(M);
    }

    // Final avalanche.
    state ^= state >> R;
    state = state.wrapping_mul(M);
    state ^ (state >> R)
}
/// Inverts [`hash`] for keys that are exactly 8 bytes long.
///
/// Given the hash of an 8-byte key and the seed it was computed with, returns that
/// key interpreted as a little-endian `u64`. For hashes of keys with any other length
/// the result has no meaning.
pub fn hash_inverse(hash: u64, seed: u64) -> u64 {
    // Undo the final avalanche. An xor-shift by `R` bits is its own inverse, and
    // multiplications are undone with the modular inverse of `M`.
    let unshifted = (hash ^ (hash >> R)).wrapping_mul(M_INVERSE);
    let state = (unshifted ^ (unshifted >> R)).wrapping_mul(M_INVERSE);

    // Initial state of the forward hash for a key length of 8.
    let initial = seed ^ M.wrapping_mul(8);

    // What remains is the mixed key word; undo its mixing as well.
    let mixed = (state ^ initial).wrapping_mul(M_INVERSE);
    (mixed ^ (mixed >> R)).wrapping_mul(M_INVERSE)
}
/// Computes a 32-bit hash of `key` by taking the upper half of the 64-bit [`hash`].
pub fn hash32(key: &[u8], seed: u32) -> u32 {
    let wide = hash(key, u64::from(seed));
    (wide >> 32) as u32
}
/// Checks `hash` against known input/seed/output triples.
#[test]
fn test_hash() {
    // (input, seed, expected hash)
    let cases: [(&str, u64, u64); 4] = [
        ("", 0, 0),
        ("", 10, 0xc26e8bc196329b0f),
        ("lua", 0, 0xa14e8dfa2cd117e2),
        ("twitch_intervention", 0, 0x069A33456AAD3042),
    ];

    for &(input, seed, expected) in cases.iter() {
        assert_eq!(expected, hash(input.as_bytes(), seed));
    }
}
/// Checks that `hash_inverse` recovers 8-byte keys from their hashes.
#[test]
fn test_inverse() {
    let seeds = [0u64, 10, u64::MAX];
    let keys: [[u8; 8]; 3] = [*b"abcdefgh", [0x00; 8], [0xFF; 8]];

    for seed in seeds.iter() {
        for key in keys.iter() {
            // `hash` reads the key as a little-endian word, so that is the value the
            // inverse has to return.
            let expected = u64::from_le_bytes(*key);
            assert_eq!(expected, hash_inverse(hash(key, *seed), *seed));
        }
    }
}