dtmt/lib/sdk/src/murmur/dictionary.rs
Lucas Schwiderski 1d08498131
feat(dtmt): Add command to print the dictionary
This is mostly helpful to check/debug whether the internal dictionary
actually contains the expected data. For manually looking through the
entire dictionary, opening the CSV file is still more convenient.
2023-02-17 22:51:46 +01:00

197 lines
4.8 KiB
Rust

use color_eyre::{eyre::Context, Help, Result, SectionExt};
use csv_async::{AsyncDeserializer, AsyncSerializer};
use serde::{Deserialize, Serialize};
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_stream::StreamExt;
use super::{murmurhash64, Murmur32, Murmur64, SEED};
/// Category that a dictionary entry is filed under.
///
/// Lookups on the dictionary are always scoped to one group, so the same
/// string may be stored once per group.
///
/// Serde (de)serializes the variants by their `snake_case` names
/// (`filename`, `filetype`, `strings`, `other`).
#[derive(Copy, Clone, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum HashGroup {
    /// Serialized as `filename`.
    Filename,
    /// Serialized as `filetype`.
    Filetype,
    /// Serialized as `strings`.
    Strings,
    /// Serialized as `other`.
    Other,
}
impl HashGroup {
pub fn all() -> [Self; 3] {
[Self::Filename, Self::Filetype, Self::Other]
}
}
impl std::fmt::Display for HashGroup {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
HashGroup::Filename => write!(f, "filename"),
HashGroup::Filetype => write!(f, "filetype"),
HashGroup::Strings => write!(f, "strings"),
HashGroup::Other => write!(f, "other"),
}
}
}
impl Default for HashGroup {
fn default() -> Self {
Self::Other
}
}
/// A single CSV record, as read by `Dictionary::from_csv` and written by
/// `Dictionary::to_csv`.
///
/// Both hash columns are optional on input: `from_csv` computes whichever
/// hash is missing from `value`. `to_csv` always writes both.
#[derive(Deserialize, Serialize)]
struct Row {
// NOTE: The order of fields is important, as the CSV serializer copies that.
/// The plain-text string that the hashes belong to.
value: String,
/// 64-bit hash of `value`, if the record carries one.
long: Option<Murmur64>,
/// 32-bit hash of `value`, if the record carries one.
short: Option<Murmur32>,
/// Group of the entry; `HashGroup::default()` is used when the field is
/// missing from the input.
#[serde(default)]
group: HashGroup,
}
/// A dictionary entry: a string together with both of its hashes and the
/// group it is filed under.
///
/// In contrast to `Row`, both hashes are always present. Fields are private
/// and exposed through the accessor methods of the same name.
pub struct Entry {
/// The plain-text string.
value: String,
/// 64-bit hash associated with `value`.
long: Murmur64,
/// 32-bit hash associated with `value`.
short: Murmur32,
/// Group this entry is filed under.
group: HashGroup,
}
impl Entry {
    /// Borrows the plain-text string of this entry.
    pub fn value(&self) -> &String {
        let Self { value, .. } = self;
        value
    }

    /// Returns a copy of the 64-bit hash stored for this entry.
    pub fn long(&self) -> Murmur64 {
        let Self { long, .. } = self;
        *long
    }

    /// Returns a copy of the 32-bit hash stored for this entry.
    pub fn short(&self) -> Murmur32 {
        let Self { short, .. } = self;
        *short
    }

    /// Returns the group this entry is filed under.
    pub fn group(&self) -> HashGroup {
        let Self { group, .. } = self;
        *group
    }
}
/// An in-memory collection of `Entry` values that can be loaded from and
/// written to CSV, and searched by string or by hash within a `HashGroup`.
pub struct Dictionary {
/// All entries, in the order they were added. Duplicates are not removed.
entries: Vec<Entry>,
}
impl Default for Dictionary {
fn default() -> Self {
Self::new()
}
}
impl Dictionary {
    /// Creates an empty dictionary.
    pub fn new() -> Self {
        Self { entries: vec![] }
    }

    /// Computes the 64-bit hash of `value` with the module's `SEED`.
    fn hash_long(value: &str) -> Murmur64 {
        Murmur64::from(murmurhash64::hash(value.as_bytes(), SEED as u64))
    }

    /// Computes the 32-bit hash of `value` with the module's `SEED`.
    fn hash_short(value: &str) -> Murmur32 {
        Murmur32::from(murmurhash64::hash32(value.as_bytes(), SEED))
    }

    /// Reads CSV records from `r` and appends them to this dictionary.
    ///
    /// A record that lacks the 64-bit or 32-bit hash gets the missing hash
    /// computed from its value. Existing entries are kept; nothing is
    /// de-duplicated.
    ///
    /// # Errors
    ///
    /// Returns the first deserialization error encountered. Records that were
    /// read before the failing one remain in the dictionary.
    pub async fn from_csv<R>(&mut self, r: R) -> Result<()>
    where
        R: AsyncRead + std::marker::Unpin + std::marker::Send,
    {
        let r = AsyncDeserializer::from_reader(r);
        let mut records = r.into_deserialize::<Row>();

        while let Some(row) = records.next().await {
            let Row {
                value,
                long,
                short,
                group,
            } = row?;

            // Only hash when the CSV did not already provide the value.
            let long = long.unwrap_or_else(|| Self::hash_long(&value));
            let short = short.unwrap_or_else(|| Self::hash_short(&value));

            self.entries.push(Entry {
                value,
                long,
                short,
                group,
            });
        }

        Ok(())
    }

    /// Writes every entry to `w` as CSV, in insertion order.
    ///
    /// # Errors
    ///
    /// Fails if an entry cannot be serialized (the error report then carries
    /// an "Entry" section describing the offending entry) or if the buffered
    /// output cannot be flushed to `w`.
    pub async fn to_csv<W>(&self, w: W) -> Result<()>
    where
        W: AsyncWrite + std::marker::Unpin,
    {
        let mut w = AsyncSerializer::from_writer(w);

        for (i, entry) in self.entries.iter().enumerate() {
            let row = Row {
                value: entry.value.clone(),
                long: Some(entry.long),
                short: Some(entry.short),
                group: entry.group,
            };

            w.serialize(row)
                .await
                .wrap_err("Failed to serialize entry")
                .with_section(|| {
                    // With the `#` flag the `0x` prefix counts towards the
                    // width, so 16/8 hex digits need a width of 18/10.
                    let s = format!(
                        "Index: {}\nValue: {}\n64bit: {:#018X}\n32bit: {:#010X}\nGroup: {}",
                        i, entry.value, entry.long, entry.short, entry.group
                    );
                    s.header("Entry")
                })?;
        }

        // Flush explicitly so that a failure while writing the buffered tail
        // is reported to the caller instead of being lost when `w` is dropped.
        w.flush().await.wrap_err("Failed to flush CSV writer")?;

        Ok(())
    }

    /// Appends `value` under `group`, computing both hashes from the string.
    ///
    /// No check for an existing identical entry is performed.
    pub fn add(&mut self, value: String, group: HashGroup) {
        let long = Self::hash_long(&value);
        let short = Self::hash_short(&value);

        self.entries.push(Entry {
            value,
            long,
            short,
            group,
        });
    }

    /// Returns the first entry in `group` whose string equals `value`.
    pub fn find(&self, value: &str, group: HashGroup) -> Option<&Entry> {
        self.entries
            .iter()
            .find(|e| e.group == group && e.value == value)
    }

    /// Returns the string of the first entry in `group` whose 64-bit hash
    /// equals `hash`.
    pub fn lookup(&self, hash: Murmur64, group: HashGroup) -> Option<&String> {
        self.entries
            .iter()
            .find(|e| e.group == group && e.long == hash)
            .map(|e| &e.value)
    }

    /// Returns the string of the first entry in `group` whose 32-bit hash
    /// equals `hash`.
    pub fn lookup_short(&self, hash: Murmur32, group: HashGroup) -> Option<&String> {
        self.entries
            .iter()
            .find(|e| e.group == group && e.short == hash)
            .map(|e| &e.value)
    }

    /// Number of entries, counting duplicates.
    pub fn len(&self) -> usize {
        self.entries.len()
    }

    /// Returns `true` if the dictionary holds no entries.
    pub fn is_empty(&self) -> bool {
        self.entries.is_empty()
    }

    /// Borrows all entries in insertion order.
    pub fn entries(&self) -> &Vec<Entry> {
        &self.entries
    }
}