diff --git a/crates/dtmt/src/cmd/experiment/extract_words.rs b/crates/dtmt/src/cmd/experiment/extract_words.rs
new file mode 100644
index 0000000..512038d
--- /dev/null
+++ b/crates/dtmt/src/cmd/experiment/extract_words.rs
@@ -0,0 +1,188 @@
+use std::collections::HashSet;
+use std::path::PathBuf;
+
+use clap::{value_parser, Arg, ArgMatches, Command, ValueEnum};
+use color_eyre::eyre::Context;
+use color_eyre::Result;
+use tokio::fs;
+
+/// Builds the clap definition of the `extract-words` subcommand.
+pub(crate) fn command_definition() -> Command {
+    Command::new("extract-words")
+        .about(
+            "Extract unique alphanumeric sequences that match common identifier rules from the given file. \
+             Only ASCII is supported.",
+        )
+        .arg(
+            Arg::new("file")
+                .required(true)
+                .value_parser(value_parser!(PathBuf))
+                .help("Path to the file to extract words from."),
+        )
+        .arg(
+            Arg::new("min-length")
+                .help("Minimum length to consider a word.")
+                .long("min-length")
+                .short('m')
+                .default_value("3")
+                .value_parser(value_parser!(usize)),
+        )
+        .arg(
+            Arg::new("algorithm")
+                .help("The algorithm to determine matching words")
+                .long("algorithm")
+                .short('a')
+                .default_value("identifier")
+                .value_parser(value_parser!(Algorithm)),
+        )
+}
+
+/// Rule set that decides which character sequences count as words.
+#[derive(Copy, Clone, Debug, ValueEnum)]
+#[value(rename_all = "snake_case")]
+enum Algorithm {
+    // Runs of ASCII letters.
+    Alphabetic,
+    // Runs of ASCII letters and digits.
+    Alphanumeric,
+    // An ASCII letter followed by ASCII letters and digits.
+    Identifier,
+    // Runs of ASCII digits.
+    Number,
+    // Runs of exactly 8 hexadecimal digits.
+    Hash32,
+    // Runs of exactly 16 hexadecimal digits.
+    Hash64,
+}
+
+impl Algorithm {
+    /// Returns `true` if `c` may begin a word.
+    fn is_start(&self, c: char) -> bool {
+        match self {
+            Self::Alphabetic | Self::Identifier => c.is_ascii_alphabetic(),
+            Self::Alphanumeric => c.is_ascii_alphanumeric(),
+            // ASCII digits only: `char::is_numeric` also accepts non-ASCII numerals,
+            // which contradicts the command's ASCII-only scope.
+            Self::Number => c.is_ascii_digit(),
+            Self::Hash32 | Self::Hash64 => c.is_ascii_hexdigit(),
+        }
+    }
+
+    /// Returns `true` if `c` may continue a word that has already started.
+    fn is_body(&self, c: char) -> bool {
+        match self {
+            Self::Alphabetic => c.is_ascii_alphabetic(),
+            Self::Alphanumeric | Self::Identifier => c.is_ascii_alphanumeric(),
+            Self::Number => c.is_ascii_digit(),
+            Self::Hash32 | Self::Hash64 => c.is_ascii_hexdigit(),
+        }
+    }
+
+    /// Returns `true` if a finished word of `len` bytes has an acceptable length.
+    fn is_length(&self, len: usize) -> bool {
+        match self {
+            Self::Alphabetic | Self::Alphanumeric | Self::Identifier | Self::Number => true,
+            Self::Hash32 => len == 8,
+            Self::Hash64 => len == 16,
+        }
+    }
+}
+
+impl std::fmt::Display for Algorithm {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(match self {
+            Algorithm::Alphabetic => "alphabetic",
+            Algorithm::Alphanumeric => "alphanumeric",
+            Algorithm::Identifier => "identifier",
+            Algorithm::Number => "number",
+            Algorithm::Hash32 => "hash32",
+            Algorithm::Hash64 => "hash64",
+        })
+    }
+}
+
+/// States of the scanner in [`run`].
+#[derive(Copy, Clone, Debug)]
+enum State {
+    /// Nothing has been read yet.
+    Begin,
+    /// The last character read is not part of a word.
+    NonWord,
+    /// A word is being accumulated.
+    Word,
+    /// The input is exhausted.
+    End,
+}
+
+/// Prints `word` on its own line if it passes the length checks and has not been printed before.
+fn emit_word(algorithm: Algorithm, min_length: usize, word: &str, visited: &mut HashSet<String>) {
+    if word.len() >= min_length && algorithm.is_length(word.len()) && !visited.contains(word) {
+        println!("{}", word);
+        visited.insert(word.to_owned());
+    }
+}
+
+/// Reads the file named by the `file` argument and prints every distinct word it contains.
+///
+/// Each word is printed to stdout once, on its own line, when it is first completed.
+///
+/// # Errors
+///
+/// Returns an error if the file cannot be read.
+#[tracing::instrument(skip_all)]
+pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
+    let path = matches
+        .get_one::<PathBuf>("file")
+        .expect("missing required parameter");
+
+    let algorithm = matches
+        .get_one::<Algorithm>("algorithm")
+        .copied()
+        .expect("parameter has default");
+
+    let min_length = matches
+        .get_one::<usize>("min-length")
+        .copied()
+        .expect("paramter has default");
+
+    let content = fs::read_to_string(&path)
+        .await
+        .wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
+    let mut chars = content.chars();
+
+    let mut state = State::Begin;
+    let mut word = String::new();
+    let mut visited = HashSet::new();
+
+    'machine: loop {
+        state = match state {
+            // No word is in progress in either state, so both wait for a start character.
+            State::Begin | State::NonWord => match chars.next() {
+                None => State::End,
+                Some(c) if algorithm.is_start(c) => {
+                    word.push(c);
+                    State::Word
+                }
+                Some(_) => State::NonWord,
+            },
+            State::End => break 'machine,
+            State::Word => match chars.next() {
+                None => {
+                    emit_word(algorithm, min_length, &word, &mut visited);
+                    State::End
+                }
+                Some(c) if algorithm.is_body(c) => {
+                    word.push(c);
+                    State::Word
+                }
+                Some(_) => {
+                    emit_word(algorithm, min_length, &word, &mut visited);
+                    word.clear();
+                    State::NonWord
+                }
+            },
+        }
+    }
+
+    Ok(())
+}
diff --git a/crates/dtmt/src/cmd/experiment/mod.rs b/crates/dtmt/src/cmd/experiment/mod.rs
index b29f83a..51e5fc7 100644
--- a/crates/dtmt/src/cmd/experiment/mod.rs
+++ b/crates/dtmt/src/cmd/experiment/mod.rs
@@ -1,15 +1,19 @@
 use clap::{ArgMatches, Command};
 use color_eyre::Result;
 
+mod extract_words;
+
 pub(crate) fn command_definition() -> Command {
     Command::new("experiment")
         .subcommand_required(true)
         .about("A collection of utilities and experiments.")
+        .subcommand(extract_words::command_definition())
 }
 
 #[tracing::instrument(skip_all)]
 pub(crate) async fn run(ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
     match matches.subcommand() {
+        Some(("extract-words", sub_matches)) => extract_words::run(ctx, sub_matches).await,
         _ => unreachable!(
             "clap is configured to require a subcommand, and they're all handled above"
         ),