diff --git a/crates/dtmt/src/cmd/experiment/extract_words.rs b/crates/dtmt/src/cmd/experiment/extract_words.rs
index 512038d..1a8cda5 100644
--- a/crates/dtmt/src/cmd/experiment/extract_words.rs
+++ b/crates/dtmt/src/cmd/experiment/extract_words.rs
@@ -1,4 +1,4 @@
-use std::collections::HashSet;
+use std::collections::HashMap;
 use std::path::PathBuf;
 
 use clap::{value_parser, Arg, ArgMatches, Command, ValueEnum};
@@ -36,7 +36,7 @@ pub(crate) fn command_definition() -> Command {
         )
 }
 
-#[derive(Copy, Clone, Debug, ValueEnum)]
+#[derive(Copy, Clone, Debug, Eq, PartialEq, ValueEnum)]
 #[value(rename_all = "snake_case")]
 enum Algorithm {
     Alphabetic,
@@ -45,6 +45,7 @@ enum Algorithm {
     Number,
     Hash32,
     Hash64,
+    Paths,
 }
 
 impl Algorithm {
@@ -55,6 +56,8 @@ impl Algorithm {
             Self::Identifier => c.is_ascii_alphabetic(),
             Self::Number => c.is_numeric(),
             Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
+            // `Paths` is handled separately by `algorithm_path_components`
+            Self::Paths => false,
         }
     }
 
@@ -65,6 +68,8 @@ impl Algorithm {
             Self::Identifier => c.is_ascii_alphanumeric(),
             Self::Number => c.is_numeric(),
             Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
+            // `Paths` is handled separately by `algorithm_path_components`
+            Self::Paths => false,
         }
     }
 
@@ -76,6 +81,8 @@ impl Algorithm {
             Self::Number => true,
             Self::Hash32 => len == 8,
             Self::Hash64 => len == 16,
+            // `Paths` is handled separately by `algorithm_path_components`
+            Self::Paths => false,
         }
     }
 }
@@ -92,11 +99,284 @@ impl std::fmt::Display for Algorithm {
                 Algorithm::Number => "number",
                 Algorithm::Hash32 => "hash32",
                 Algorithm::Hash64 => "hash64",
+                Algorithm::Paths => "paths",
             }
         )
     }
 }
 
+/// States shared by the path scanning state machines below.
+#[derive(Copy, Clone, Debug)]
+enum PathState {
+    Begin,
+    PathComponent,
+    PathSeparator,
+    Boundary,
+    NonWord,
+    End,
+}
+
+/// Extracts `/`-separated paths from `chars`, each as the list of its components.
+///
+/// A component starts with an ASCII letter (or `_` directly after a separator)
+/// and continues with ASCII alphanumerics or `_`. Newline, space, tab, `,` and
+/// `|` end a path. Any other character aborts the current path: the components
+/// completed so far are kept, the partial one is dropped and input is skipped
+/// up to the next boundary.
+#[tracing::instrument(skip(chars))]
+fn extract_paths(mut chars: impl Iterator<Item = char>) -> Vec<Vec<String>> {
+    // Moves a finished path into `list`; paths without any component are dropped.
+    fn flush(list: &mut Vec<Vec<String>>, path: &mut Vec<String>) {
+        if !path.is_empty() {
+            list.push(std::mem::take(path));
+        }
+    }
+
+    let mut state = PathState::Begin;
+    let mut list = Vec::new();
+    let mut path = Vec::new();
+    let mut word = String::new();
+
+    let is_boundary = |c: char| c == '\n' || c == ' ' || c == ',' || c == '\t' || c == '|';
+
+    'machine: loop {
+        state = match state {
+            PathState::Begin => match chars.next() {
+                None => PathState::End,
+                Some(c) if c.is_ascii_alphabetic() => {
+                    word.push(c);
+                    PathState::PathComponent
+                }
+                Some(c) if is_boundary(c) => PathState::Boundary,
+                Some('/') => PathState::PathSeparator,
+                Some(_) => PathState::NonWord,
+            },
+            PathState::PathComponent => match chars.next() {
+                None => {
+                    path.push(std::mem::take(&mut word));
+                    flush(&mut list, &mut path);
+
+                    PathState::End
+                }
+                Some(c) if c.is_ascii_alphanumeric() || c == '_' => {
+                    word.push(c);
+                    PathState::PathComponent
+                }
+                Some('/') => {
+                    path.push(std::mem::take(&mut word));
+
+                    PathState::PathSeparator
+                }
+                Some(c) if is_boundary(c) => {
+                    path.push(std::mem::take(&mut word));
+                    flush(&mut list, &mut path);
+
+                    PathState::Boundary
+                }
+                Some(_) => {
+                    // Invalid character inside a component: keep the components
+                    // completed so far and drop the partial one
+                    word.clear();
+                    flush(&mut list, &mut path);
+
+                    PathState::NonWord
+                }
+            },
+            PathState::PathSeparator => match chars.next() {
+                None => {
+                    flush(&mut list, &mut path);
+                    PathState::End
+                }
+                Some('/') => PathState::PathSeparator,
+                Some(c) if c.is_ascii_alphabetic() || c == '_' => {
+                    word.push(c);
+                    PathState::PathComponent
+                }
+                Some(c) if is_boundary(c) => {
+                    flush(&mut list, &mut path);
+                    PathState::Boundary
+                }
+                Some(_) => {
+                    flush(&mut list, &mut path);
+                    PathState::NonWord
+                }
+            },
+            PathState::Boundary => match chars.next() {
+                None => PathState::End,
+                Some(c) if c.is_ascii_alphabetic() => {
+                    word.push(c);
+                    PathState::PathComponent
+                }
+                Some(c) if is_boundary(c) => PathState::Boundary,
+                Some(_) => PathState::NonWord,
+            },
+            PathState::NonWord => match chars.next() {
+                None => PathState::End,
+                Some(c) if is_boundary(c) => PathState::Boundary,
+                Some(_) => PathState::NonWord,
+            },
+            PathState::End => {
+                break 'machine;
+            }
+        }
+    }
+
+    list
+}
+
+/// Counts the path components found in `chars` per depth and prints them as CSV.
+///
+/// Column pair `i` (`Word i, Count i`) lists the components seen at depth `i` of
+/// a `/`-separated path, most frequent first and alphabetically among equal
+/// counts. Components shorter than `min_length` are not counted, and neither are
+/// words that are not part of a path.
+#[tracing::instrument(skip(chars))]
+fn algorithm_path_components(chars: impl Iterator<Item = char>, min_length: usize) {
+    // Increments the number of occurrences of `word` in `list`.
+    fn record(list: &mut HashMap<String, usize>, word: &str) {
+        *list.entry(word.to_owned()).or_insert(0) += 1;
+    }
+
+    let mut chars = chars.peekable();
+
+    let mut state = PathState::Begin;
+    let mut word = String::new();
+    let mut lists = vec![HashMap::<String, usize>::new()];
+    let mut index = 0;
+
+    let is_boundary = |c: char| c == '\n' || c == ' ' || c == ',' || c == '\t';
+
+    'machine: loop {
+        state = match state {
+            PathState::Begin => match chars.next() {
+                None => PathState::End,
+                Some(c) if c.is_ascii_alphabetic() => {
+                    word.push(c);
+                    PathState::PathComponent
+                }
+                Some(c) if is_boundary(c) => PathState::Boundary,
+                // Ignore leading path separators to not trigger the logic of advancing
+                // the component count
+                Some('/') => PathState::Boundary,
+                Some(_) => PathState::NonWord,
+            },
+            PathState::PathComponent => match chars.next() {
+                None => PathState::End,
+                Some(c) if c.is_ascii_alphanumeric() || c == '_' => {
+                    word.push(c);
+                    PathState::PathComponent
+                }
+                Some('/') => PathState::PathSeparator,
+                Some(c) => {
+                    // `index == 0` means no separator preceded the word, i.e. it
+                    // is a plain word rather than a path component
+                    if index > 0 && word.len() >= min_length {
+                        record(&mut lists[index], &word);
+                    }
+                    word.clear();
+
+                    index = 0;
+
+                    if is_boundary(c) {
+                        PathState::Boundary
+                    } else {
+                        PathState::NonWord
+                    }
+                }
+            },
+            PathState::PathSeparator => {
+                if word.len() >= min_length {
+                    record(&mut lists[index], &word);
+                }
+                word.clear();
+
+                index += 1;
+                if lists.len() <= index {
+                    lists.push(HashMap::new());
+                }
+
+                // Ignore multiple separators
+                while chars.next_if(|c| *c == '/').is_some() {}
+
+                match chars.next() {
+                    None => PathState::End,
+                    Some(c) if c.is_ascii_alphabetic() || c == '_' => {
+                        word.push(c);
+                        PathState::PathComponent
+                    }
+                    Some(c) if is_boundary(c) => {
+                        index = 0;
+                        PathState::Boundary
+                    }
+                    Some(_) => {
+                        index = 0;
+                        PathState::NonWord
+                    }
+                }
+            }
+            PathState::Boundary => match chars.next() {
+                None => PathState::End,
+                Some(c) if c.is_ascii_alphabetic() => {
+                    word.push(c);
+                    PathState::PathComponent
+                }
+                Some(c) if is_boundary(c) => PathState::Boundary,
+                // NOTE(review): unlike in `Begin`, a `/` right after a boundary ends up in
+                // `NonWord`, which skips the whole absolute path - confirm this is intended
+                Some(_) => PathState::NonWord,
+            },
+            PathState::NonWord => match chars.next() {
+                None => PathState::End,
+                Some(c) if is_boundary(c) => PathState::Boundary,
+                Some(_) => PathState::NonWord,
+            },
+            PathState::End => {
+                // A word is only pending here when the input ended in the middle
+                // of a component. Count it under the same rule as above, so that
+                // neither a trailing plain word nor an empty word is recorded.
+                if index > 0 && !word.is_empty() && word.len() >= min_length {
+                    record(&mut lists[index], &word);
+                }
+
+                break 'machine;
+            }
+        }
+    }
+
+    for i in 0..lists.len() {
+        print!("Word {i}, Count {i},");
+    }
+    println!();
+
+    let mut lines: Vec<Vec<Option<(String, usize)>>> = Vec::new();
+
+    for (i, list) in lists.into_iter().enumerate() {
+        let mut entries = list.into_iter().collect::<Vec<_>>();
+        // Highest count first; the word breaks ties to keep the output deterministic
+        entries.sort_by(|(word_a, a), (word_b, b)| b.cmp(a).then_with(|| word_a.cmp(word_b)));
+
+        for (j, entry) in entries.into_iter().enumerate() {
+            if lines.len() <= j {
+                lines.push(Vec::new());
+            }
+            let line = &mut lines[j];
+            // Leave the cells of shorter columns to the left empty
+            line.resize(i, None);
+            line.push(Some(entry));
+        }
+    }
+
+    for line in &lines {
+        for cell in line {
+            match cell {
+                Some((word, count)) => print!("{word},{count},"),
+                None => print!(",,"),
+            }
+        }
+        println!();
+    }
+}
+
 #[derive(Copy, Clone, Debug)]
 enum State {
     Begin,
@@ -125,9 +405,14 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
         .wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
     let mut chars = content.chars();
 
+    if *algorithm == Algorithm::Paths {
+        algorithm_path_components(chars, min_length);
+        return Ok(());
+    }
+
     let mut state = State::Begin;
     let mut word = String::new();
-    let mut visited = HashSet::new();
+    let mut visited = HashMap::new();
 
     'machine: loop {
         state = match state {
@@ -150,12 +435,11 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
             },
             State::Word => match chars.next() {
                 None => {
-                    if word.len() >= min_length
-                        && algorithm.is_length(word.len())
-                        && !visited.contains(&word)
-                    {
-                        println!("{}", &word);
-                        visited.insert(word.clone());
+                    if word.len() >= min_length && algorithm.is_length(word.len()) {
+                        visited
+                            .entry(word.clone())
+                            .and_modify(|v| *v += 1)
+                            .or_insert(1);
                     }
                     State::End
                 }
@@ -164,12 +448,11 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
                     State::Word
                 }
                 Some(_) => {
-                    if word.len() >= min_length
-                        && algorithm.is_length(word.len())
-                        && !visited.contains(&word)
-                    {
-                        println!("{}", &word);
-                        visited.insert(word.clone());
+                    if word.len() >= min_length && algorithm.is_length(word.len()) {
+                        visited
+                            .entry(word.clone())
+                            .and_modify(|v| *v += 1)
+                            .or_insert(1);
                     }
                     word.clear();
                     State::NonWord
@@ -178,5 +461,13 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
         }
     }
 
+    let mut entries: Vec<(String, usize)> = visited.into_iter().collect();
+    // Reverse sides during comparison to get "highest to lowest"
+    entries.sort_by(|(_, a), (_, b)| b.cmp(a));
+
+    entries
+        .iter()
+        .for_each(|(word, count)| println!("{:016} {}", word, count));
+
     Ok(())
 }