dtmt: Add word extraction algorithm for paths

2023-09-22 11:46:57 +02:00 · 2023-09-22 11:46:57 +02:00 · ae1e7e5aa6
commit ae1e7e5aa6
parent 6ada4c1c43
1 changed files with 296 additions and 15 deletions
--- a/crates/dtmt/src/cmd/experiment/extract_words.rs
+++ b/crates/dtmt/src/cmd/experiment/extract_words.rs
@ -1,4 +1,4 @@
-use std::collections::HashSet;
+use std::collections::HashMap;
 use std::path::PathBuf;
 use clap::{value_parser, Arg, ArgMatches, Command, ValueEnum};
@ -36,7 +36,7 @@ pub(crate) fn command_definition() -> Command {
        )
 }
-#[derive(Copy, Clone, Debug, ValueEnum)]
+#[derive(Copy, Clone, Debug, Eq, PartialEq, ValueEnum)]
 #[value(rename_all = "snake_case")]
 enum Algorithm {
    Alphabetic,
@ -45,6 +45,7 @@ enum Algorithm {
    Number,
    Hash32,
    Hash64,
    Paths,
 }
 impl Algorithm {
@ -55,6 +56,8 @@ impl Algorithm {
            Self::Identifier => c.is_ascii_alphabetic(),
            Self::Number => c.is_numeric(),
            Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
            // Supposed to be handled separately
            Self::Paths => false,
        }
    }
@ -65,6 +68,8 @@ impl Algorithm {
            Self::Identifier => c.is_ascii_alphanumeric(),
            Self::Number => c.is_numeric(),
            Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
            // Supposed to be handled separately
            Self::Paths => false,
        }
    }
@ -76,6 +81,8 @@ impl Algorithm {
            Self::Number => true,
            Self::Hash32 => len == 8,
            Self::Hash64 => len == 16,
            // Supposed to be handled separately
            Self::Paths => false,
        }
    }
 }
@ -92,11 +99,274 @@ impl std::fmt::Display for Algorithm {
                Algorithm::Number => "number",
                Algorithm::Hash32 => "hash32",
                Algorithm::Hash64 => "hash64",
                Algorithm::Paths => "paths",
            }
        )
    }
 }
 /// States of the hand-written state machines (`extract_paths` and
 /// `algorithm_path_components`) that scan text for `/`-separated paths.
 #[derive(Copy, Clone, Debug)]
 enum PathState {
     /// Initial state, before any character has been consumed.
     Begin,
     /// Inside a path component; characters are being appended to the current word.
     PathComponent,
     /// A `/` has just been consumed.
     PathSeparator,
     /// Between words: entered after a boundary character such as whitespace or a comma
     /// (and, in `algorithm_path_components`, after a `/` at the very start of the input).
     Boundary,
     /// Inside text that cannot be part of a word; left only when a boundary character
     /// (or the end of the input) is reached.
     NonWord,
     /// The input is exhausted; the state machine stops.
     End,
 }
 /// Scans `chars` for `/`-separated paths and returns each one as the list of its components.
 ///
 /// A component starts with an ASCII letter (or with `_` when it directly follows a `/`) and
 /// continues with ASCII alphanumerics and `_`. Runs of `/` count as a single separator.
 /// Newline, space, tab, `,` and `|` are boundaries that finish the current path. Any other
 /// character finishes the path as well, but the component that was being read is dropped
 /// and everything up to the next boundary is skipped.
 ///
 /// The collected components are pushed whenever a path ends, even when there are none, so
 /// the returned list can contain empty paths.
 #[tracing::instrument(skip(chars))]
 fn extract_paths(chars: impl Iterator<Item = char>) -> Vec<Vec<String>> {
     let is_boundary = |c: char| matches!(c, '\n' | ' ' | ',' | '\t' | '|');
     let mut paths: Vec<Vec<String>> = Vec::new();
     let mut components: Vec<String> = Vec::new();
     let mut current = String::new();
     let mut state = PathState::Begin;
     for c in chars {
         state = match state {
             PathState::Begin => {
                 if c.is_ascii_alphabetic() {
                     current.push(c);
                     PathState::PathComponent
                 } else if is_boundary(c) {
                     PathState::Boundary
                 } else if c == '/' {
                     PathState::PathSeparator
                 } else {
                     PathState::NonWord
                 }
             }
             PathState::PathComponent => {
                 if c.is_ascii_alphanumeric() || c == '_' {
                     current.push(c);
                     PathState::PathComponent
                 } else if c == '/' {
                     // The component is complete, the path continues.
                     components.push(std::mem::take(&mut current));
                     PathState::PathSeparator
                 } else if is_boundary(c) {
                     // Both the component and the path are complete.
                     components.push(std::mem::take(&mut current));
                     paths.push(std::mem::take(&mut components));
                     PathState::Boundary
                 } else {
                     // The unfinished component is discarded, the components collected
                     // before it are still emitted.
                     current.clear();
                     paths.push(std::mem::take(&mut components));
                     PathState::NonWord
                 }
             }
             PathState::PathSeparator => {
                 if c == '/' {
                     PathState::PathSeparator
                 } else if c.is_ascii_alphabetic() || c == '_' {
                     current.push(c);
                     PathState::PathComponent
                 } else {
                     paths.push(std::mem::take(&mut components));
                     if is_boundary(c) {
                         PathState::Boundary
                     } else {
                         PathState::NonWord
                     }
                 }
             }
             PathState::Boundary => {
                 if c.is_ascii_alphabetic() {
                     current.push(c);
                     PathState::PathComponent
                 } else if is_boundary(c) {
                     PathState::Boundary
                 } else {
                     PathState::NonWord
                 }
             }
             PathState::NonWord => {
                 if is_boundary(c) {
                     PathState::Boundary
                 } else {
                     PathState::NonWord
                 }
             }
             // Never assigned while characters remain; kept for exhaustiveness.
             PathState::End => break,
         };
     }
     // The input is exhausted: flush whatever the final state still holds.
     match state {
         PathState::PathComponent => {
             components.push(current);
             paths.push(components);
         }
         PathState::PathSeparator => paths.push(components),
         PathState::Begin | PathState::Boundary | PathState::NonWord | PathState::End => {}
     }
     paths
 }
 /// Counts `word` as an occurrence at path position `index` and leaves `word` empty.
 ///
 /// Empty words and words shorter than `min_length` are discarded without being counted.
 /// `lists` must already contain an entry for `index`.
 fn count_path_component(
     lists: &mut [HashMap<String, usize>],
     index: usize,
     word: &mut String,
     min_length: usize,
 ) {
     if word.is_empty() || word.len() < min_length {
         word.clear();
         return;
     }
     // Moving the word out avoids a clone and resets the buffer in one step.
     *lists[index].entry(std::mem::take(word)).or_insert(0) += 1;
 }
 /// Counts how often each word occurs at each position of the `/`-separated paths found in
 /// `chars`, and prints the result to stdout as comma-separated columns.
 ///
 /// A component starts with an ASCII letter (or with `_` when it directly follows a `/`) and
 /// continues with ASCII alphanumerics and `_`. Runs of `/` count as a single separator.
 /// Newline, space, tab and `,` are boundaries; any other character ends the current path
 /// and the text up to the next boundary is skipped. A word is only counted when it belongs
 /// to a path, i.e. when it is followed or preceded by a separator, and when it is at least
 /// `min_length` bytes long.
 ///
 /// The output starts with a header line containing `Word {i}, Count {i},` for every path
 /// position `i`, followed by one line per rank. Each position's words are ordered by count,
 /// highest first, with ties broken alphabetically so that the output is deterministic.
 #[tracing::instrument(skip(chars))]
 fn algorithm_path_components(chars: impl Iterator<Item = char>, min_length: usize) {
     let mut chars = chars.peekable();
     let mut state = PathState::Begin;
     let mut word = String::new();
     // `lists[i]` counts the words seen as the i-th component of a path.
     let mut lists = vec![HashMap::<String, usize>::new()];
     // Position of the component currently being read within its path.
     let mut index = 0;
     let is_boundary = |c: char| matches!(c, '\n' | ' ' | ',' | '\t');
     'machine: loop {
         state = match state {
             PathState::Begin => match chars.next() {
                 None => PathState::End,
                 Some(c) if c.is_ascii_alphabetic() => {
                     word.push(c);
                     PathState::PathComponent
                 }
                 Some(c) if is_boundary(c) => PathState::Boundary,
                 // Ignore leading path separators to not trigger the logic of advancing
                 // the component count
                 Some('/') => PathState::Boundary,
                 Some(_) => PathState::NonWord,
             },
             PathState::PathComponent => match chars.next() {
                 None => PathState::End,
                 Some(c) if c.is_ascii_alphanumeric() || c == '_' => {
                     word.push(c);
                     PathState::PathComponent
                 }
                 Some('/') => PathState::PathSeparator,
                 Some(c) => {
                     // A word that was never preceded by a separator is not part of a
                     // path and is therefore not counted.
                     if index > 0 {
                         count_path_component(&mut lists, index, &mut word, min_length);
                     }
                     word.clear();
                     index = 0;
                     if is_boundary(c) {
                         PathState::Boundary
                     } else {
                         PathState::NonWord
                     }
                 }
             },
             PathState::PathSeparator => {
                 count_path_component(&mut lists, index, &mut word, min_length);
                 index += 1;
                 // Make sure a counter exists for the position that follows the separator.
                 if index >= lists.len() {
                     lists.push(HashMap::new());
                 }
                 // Ignore multiple separators
                 while chars.next_if(|c| *c == '/').is_some() {}
                 match chars.next() {
                     None => PathState::End,
                     Some(c) if c.is_ascii_alphabetic() || c == '_' => {
                         word.push(c);
                         PathState::PathComponent
                     }
                     Some(c) if is_boundary(c) => {
                         index = 0;
                         PathState::Boundary
                     }
                     Some(_) => {
                         index = 0;
                         PathState::NonWord
                     }
                 }
             }
             // NOTE(review): a `/` here falls through to `NonWord`, so a path with a leading
             // separator is only recognised at the very start of the input. Confirm that
             // this is intended.
             PathState::Boundary => match chars.next() {
                 None => PathState::End,
                 Some(c) if c.is_ascii_alphabetic() => {
                     word.push(c);
                     PathState::PathComponent
                 }
                 Some(c) if is_boundary(c) => PathState::Boundary,
                 Some(_) => PathState::NonWord,
             },
             PathState::NonWord => match chars.next() {
                 None => PathState::End,
                 Some(c) if is_boundary(c) => PathState::Boundary,
                 Some(_) => PathState::NonWord,
             },
             PathState::End => {
                 // Flush a trailing word under the same rule as inside the input: only
                 // when it is part of a path. This keeps the result independent of whether
                 // the input ends with a boundary character.
                 if index > 0 {
                     count_path_component(&mut lists, index, &mut word, min_length);
                 }
                 break 'machine;
             }
         }
     }
     for i in 0..lists.len() {
         print!("Word {i}, Count {i},");
     }
     println!();
     // `lines[j][i]` is the j-th most frequent word of position `i`, or `None` when a
     // position to the left has fewer than `j + 1` distinct words.
     let mut lines: Vec<Vec<Option<(String, usize)>>> = Vec::new();
     for (i, list) in lists.into_iter().enumerate() {
         let mut entries: Vec<(String, usize)> = list.into_iter().collect();
         // Highest count first. The alphabetical tie-break removes the dependency on the
         // hash map's iteration order.
         entries.sort_by(|(word_a, a), (word_b, b)| b.cmp(a).then_with(|| word_a.cmp(word_b)));
         for (j, entry) in entries.into_iter().enumerate() {
             if j == lines.len() {
                 lines.push(Vec::new());
             }
             let line = &mut lines[j];
             // Pad with empty cells so that this entry ends up in column `i`. A line never
             // holds more than `i` cells at this point, so this only ever grows it.
             line.resize(i, None);
             line.push(Some(entry));
         }
     }
     for line in &lines {
         for cell in line {
             match cell {
                 Some((word, count)) => print!("{word},{count},"),
                 None => print!(",,"),
             }
         }
         println!();
     }
 }
 #[derive(Copy, Clone, Debug)]
 enum State {
    Begin,
@ -125,9 +395,14 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
        .wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
    let mut chars = content.chars();
    if *algorithm == Algorithm::Paths {
        algorithm_path_components(chars, min_length);
        return Ok(());
    }
    let mut state = State::Begin;
    let mut word = String::new();
-    let mut visited = HashSet::new();
+    let mut visited = HashMap::new();
    'machine: loop {
        state = match state {
@ -150,12 +425,11 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
            },
            State::Word => match chars.next() {
                None => {
-                    if word.len() >= min_length
+                    if word.len() >= min_length && algorithm.is_length(word.len()) {
-                        && algorithm.is_length(word.len())
+                        visited
-                        && !visited.contains(&word)
+                            .entry(word.clone())
-                    {
+                            .and_modify(|v| *v += 1)
-                        println!("{}", &word);
+                            .or_insert(1);
                        visited.insert(word.clone());
                    }
                    State::End
                }
@ -164,12 +438,11 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
                    State::Word
                }
                Some(_) => {
-                    if word.len() >= min_length
+                    if word.len() >= min_length && algorithm.is_length(word.len()) {
-                        && algorithm.is_length(word.len())
+                        visited
-                        && !visited.contains(&word)
+                            .entry(word.clone())
-                    {
+                            .and_modify(|v| *v += 1)
-                        println!("{}", &word);
+                            .or_insert(1);
                        visited.insert(word.clone());
                    }
                    word.clear();
                    State::NonWord
@ -178,5 +451,13 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
        }
    }
    let mut entries: Vec<(String, usize)> = visited.into_iter().collect();
    // Reverse sides during comparison to get "highest to lowest"
    entries.sort_by(|(_, a), (_, b)| b.partial_cmp(a).unwrap());
    entries
        .iter()
        .for_each(|(word, count)| println!("{:016} {}", word, count));
    Ok(())
 }