dtmt/src/murmur/dictionary.rs

154 lines
4 KiB
Rust

use clap::ValueEnum;
use color_eyre::{eyre::Context, Help, Result, SectionExt};
use csv_async::{AsyncDeserializer, AsyncSerializer};
use serde::{Deserialize, Serialize};
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_stream::StreamExt;
use super::{murmurhash64, Murmur32, Murmur64, SEED};
/// Category a dictionary entry is filed under.
///
/// Serde reads and writes the variants in `snake_case`. Lookups on a
/// dictionary only consider entries whose group equals the requested one.
//
// NOTE: Keep variant comments as plain `//` comments. `ValueEnum` uses `///`
// doc comments on variants as CLI help text, so adding them changes output.
#[derive(Clone, Copy, PartialEq, Serialize, Deserialize, ValueEnum)]
#[serde(rename_all = "snake_case")]
pub enum HashGroup {
    Filename,
    Filetype,
    Other,
}
impl std::fmt::Display for HashGroup {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
HashGroup::Filename => write!(f, "filename"),
HashGroup::Filetype => write!(f, "filetype"),
HashGroup::Other => write!(f, "other"),
}
}
}
impl Default for HashGroup {
fn default() -> Self {
Self::Other
}
}
/// One CSV record as read from or written to disk.
///
/// Both hashes are optional on input: `Dictionary::from_csv` computes any
/// missing hash from `value`. On output, `Dictionary::to_csv` always fills
/// them in.
#[derive(Serialize, Deserialize)]
struct Row {
    // NOTE: Do not reorder the fields. The CSV serializer copies their order.
    /// The plain-text string the hashes belong to.
    value: String,
    /// 64-bit hash of `value`, if the record provides one.
    long: Option<Murmur64>,
    /// 32-bit hash of `value`, if the record provides one.
    short: Option<Murmur32>,
    /// Group of the entry; `HashGroup::default()` when the column is absent.
    #[serde(default)]
    group: HashGroup,
}
/// In-memory dictionary entry.
///
/// Unlike `Row`, both hashes are always present here.
struct Entry {
    /// The plain-text string the hashes belong to.
    value: String,
    /// 64-bit hash, compared by `Dictionary::lookup`.
    long: Murmur64,
    /// 32-bit hash, compared by `Dictionary::lookup_short`.
    short: Murmur32,
    /// Group that a lookup must request in order to match this entry.
    group: HashGroup,
}
/// Collection of known strings together with their hashes.
///
/// Entries are kept in insertion order. Nothing in this type removes
/// duplicates.
pub struct Dictionary {
    /// All entries, in the order they were added or loaded.
    entries: Vec<Entry>,
}
impl Default for Dictionary {
fn default() -> Self {
Self::new()
}
}
impl Dictionary {
    /// Creates a dictionary without any entries.
    pub fn new() -> Self {
        Self {
            entries: Vec::new(),
        }
    }

    /// Computes the 64-bit hash that is stored for `value`.
    fn hash_long(value: &str) -> Murmur64 {
        Murmur64::from(murmurhash64::hash(value.as_bytes(), SEED as u64))
    }

    /// Computes the 32-bit hash that is stored for `value`.
    fn hash_short(value: &str) -> Murmur32 {
        Murmur32::from(murmurhash64::hash32(value.as_bytes(), SEED))
    }

    /// Reads CSV records from `r` and appends one entry per record.
    ///
    /// Entries already in the dictionary are kept. A record that lacks the
    /// 64-bit or 32-bit hash gets the missing hash computed from its value.
    ///
    /// # Errors
    ///
    /// Returns the first deserialization error. Entries appended before the
    /// failing record stay in the dictionary.
    pub async fn from_csv<R>(&mut self, r: R) -> Result<()>
    where
        R: AsyncRead + std::marker::Unpin + std::marker::Send,
    {
        let reader = AsyncDeserializer::from_reader(r);
        let mut rows = reader.into_deserialize::<Row>();

        while let Some(row) = rows.next().await {
            let Row {
                value,
                long,
                short,
                group,
            } = row?;

            // Only hash when the record did not provide the value itself.
            let long = long.unwrap_or_else(|| Self::hash_long(&value));
            let short = short.unwrap_or_else(|| Self::hash_short(&value));

            self.entries.push(Entry {
                value,
                long,
                short,
                group,
            });
        }

        Ok(())
    }

    /// Writes every entry to `w` as CSV, in insertion order.
    ///
    /// # Errors
    ///
    /// Returns an error if an entry cannot be serialized (the report carries
    /// an "Entry" section describing it) or if flushing the writer fails.
    pub async fn to_csv<W>(&self, w: W) -> Result<()>
    where
        W: AsyncWrite + std::marker::Unpin,
    {
        let mut w = AsyncSerializer::from_writer(w);

        for (i, entry) in self.entries.iter().enumerate() {
            let row = Row {
                value: entry.value.clone(),
                long: Some(entry.long),
                short: Some(entry.short),
                group: entry.group,
            };

            w.serialize(row)
                .await
                .wrap_err("Failed to serialize entry")
                .with_section(|| {
                    // With `#`, the `0x` prefix counts toward the width, so
                    // 18 and 10 give the full 16 and 8 hex digits.
                    // NOTE(review): assumes the hash types forward the
                    // formatter flags to their inner integer -- confirm.
                    let s = format!(
                        "Index: {}\nValue: {}\n64bit: {:#018X}\n32bit: {:#010X}\nGroup: {}",
                        i, entry.value, entry.long, entry.short, entry.group
                    );
                    s.header("Entry")
                })?;
        }

        // The serializer buffers its output. Flush explicitly so that all
        // rows reach `w` and a failing write is reported to the caller.
        w.flush().await.wrap_err("Failed to flush CSV writer")?;

        Ok(())
    }

    /// Hashes `value` and appends it to the dictionary under `group`.
    ///
    /// No check for existing entries is made, so adding the same value twice
    /// stores it twice.
    pub fn add(&mut self, value: String, group: HashGroup) {
        let long = Self::hash_long(&value);
        let short = Self::hash_short(&value);

        self.entries.push(Entry {
            value,
            long,
            short,
            group,
        });
    }

    /// Finds the value for a 64-bit `hash` within `group`.
    ///
    /// Returns the first entry, in insertion order, whose group and 64-bit
    /// hash both match, or `None` if there is none.
    pub fn lookup(&self, hash: Murmur64, group: HashGroup) -> Option<&String> {
        self.entries
            .iter()
            .find(|e| e.group == group && e.long == hash)
            .map(|e| &e.value)
    }

    /// Finds the value for a 32-bit `hash` within `group`.
    ///
    /// Returns the first entry, in insertion order, whose group and 32-bit
    /// hash both match, or `None` if there is none.
    pub fn lookup_short(&self, hash: Murmur32, group: HashGroup) -> Option<&String> {
        self.entries
            .iter()
            .find(|e| e.group == group && e.short == hash)
            .map(|e| &e.value)
    }
}