diff --git a/CHANGELOG.adoc b/CHANGELOG.adoc
index 10932d0..c00f02e 100644
--- a/CHANGELOG.adoc
+++ b/CHANGELOG.adoc
@@ -2,6 +2,10 @@
 
 == [Unreleased]
 
+=== Added
+
+- show status after adding dictionary entries
+
 == [v0.2.0] - 2022-12-28
 
 === Added
diff --git a/crates/dtmt/src/cmd/dictionary.rs b/crates/dtmt/src/cmd/dictionary.rs
index dc09ad6..14b471d 100644
--- a/crates/dtmt/src/cmd/dictionary.rs
+++ b/crates/dtmt/src/cmd/dictionary.rs
@@ -112,13 +112,25 @@ pub(crate) async fn run(ctx: Arc>, matches: &ArgMatches) ->
                 BufReader::new(Box::new(f))
             };
 
+            let mut added = 0;
+            let mut skipped = 0;
+
             let lines: Vec<_> = LinesStream::new(r.lines()).collect().await;
-            {
+            let total = {
                 let mut ctx = ctx.write().await;
                 for line in lines.into_iter() {
-                    ctx.lookup.add(line?, (*group).into());
+                    let value = line?;
+                    // Count values already present in this group instead of adding them again.
+                    if ctx.lookup.find(&value, (*group).into()).is_some() {
+                        skipped += 1;
+                    } else {
+                        ctx.lookup.add(value, (*group).into());
+                        added += 1;
+                    }
                 }
-            }
+
+                ctx.lookup.len()
+            };
 
             let out_path = matches
                 .get_one::("dictionary")
@@ -139,7 +151,15 @@ pub(crate) async fn run(ctx: Arc>, matches: &ArgMatches) ->
                 .lookup
                 .to_csv(f)
                 .await
-                .wrap_err("Failed to write dictionary to disk")
+                .wrap_err("Failed to write dictionary to disk")?;
+
+            tracing::info!(
+                "Added {} entries, skipped {} duplicates. Total now {}.",
+                added,
+                skipped,
+                total
+            );
+            Ok(())
         }
         Some(("save", _)) => {
             let out_path = matches
diff --git a/lib/sdk/src/murmur/dictionary.rs b/lib/sdk/src/murmur/dictionary.rs
index abf90b7..322dded 100644
--- a/lib/sdk/src/murmur/dictionary.rs
+++ b/lib/sdk/src/murmur/dictionary.rs
@@ -48,7 +48,7 @@ struct Row {
     group: HashGroup,
 }
 
-struct Entry {
+pub struct Entry {
     value: String,
     long: Murmur64,
     short: Murmur32,
@@ -143,6 +143,13 @@ impl Dictionary {
         self.entries.push(entry);
     }
 
+    /// Returns the first entry whose value and group both match, if any.
+    pub fn find(&self, value: &str, group: HashGroup) -> Option<&Entry> {
+        self.entries
+            .iter()
+            .find(|e| e.value == value && e.group == group)
+    }
+
     pub fn lookup(&self, hash: Murmur64, group: HashGroup) -> Option<&String> {
         self.entries
             .iter()
@@ -158,4 +165,14 @@ impl Dictionary {
             .find(|e| e.short == hash)
             .map(|e| &e.value)
     }
+
+    /// Returns the number of entries in the dictionary.
+    pub fn len(&self) -> usize {
+        self.entries.len()
+    }
+
+    /// Returns `true` if the dictionary contains no entries.
+    pub fn is_empty(&self) -> bool {
+        self.entries.is_empty()
+    }
 }