dtmt: Add word extraction algorithm for paths

This commit is contained in:
Lucas Schwiderski 2023-09-22 11:46:57 +02:00
parent 6ada4c1c43
commit ae1e7e5aa6
Signed by: lucas
GPG key ID: AA12679AAA6DF4D8

View file

@ -1,4 +1,4 @@
use std::collections::HashSet; use std::collections::HashMap;
use std::path::PathBuf; use std::path::PathBuf;
use clap::{value_parser, Arg, ArgMatches, Command, ValueEnum}; use clap::{value_parser, Arg, ArgMatches, Command, ValueEnum};
@ -36,7 +36,7 @@ pub(crate) fn command_definition() -> Command {
) )
} }
#[derive(Copy, Clone, Debug, ValueEnum)] #[derive(Copy, Clone, Debug, Eq, PartialEq, ValueEnum)]
#[value(rename_all = "snake_case")] #[value(rename_all = "snake_case")]
enum Algorithm { enum Algorithm {
Alphabetic, Alphabetic,
@ -45,6 +45,7 @@ enum Algorithm {
Number, Number,
Hash32, Hash32,
Hash64, Hash64,
Paths,
} }
impl Algorithm { impl Algorithm {
@ -55,6 +56,8 @@ impl Algorithm {
Self::Identifier => c.is_ascii_alphabetic(), Self::Identifier => c.is_ascii_alphabetic(),
Self::Number => c.is_numeric(), Self::Number => c.is_numeric(),
Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'), Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
// Supposed to be handled separately
Self::Paths => false,
} }
} }
@ -65,6 +68,8 @@ impl Algorithm {
Self::Identifier => c.is_ascii_alphanumeric(), Self::Identifier => c.is_ascii_alphanumeric(),
Self::Number => c.is_numeric(), Self::Number => c.is_numeric(),
Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'), Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
// Supposed to be handled separately
Self::Paths => false,
} }
} }
@ -76,6 +81,8 @@ impl Algorithm {
Self::Number => true, Self::Number => true,
Self::Hash32 => len == 8, Self::Hash32 => len == 8,
Self::Hash64 => len == 16, Self::Hash64 => len == 16,
// Supposed to be handled separately
Self::Paths => false,
} }
} }
} }
@ -92,11 +99,274 @@ impl std::fmt::Display for Algorithm {
Algorithm::Number => "number", Algorithm::Number => "number",
Algorithm::Hash32 => "hash32", Algorithm::Hash32 => "hash32",
Algorithm::Hash64 => "hash64", Algorithm::Hash64 => "hash64",
Algorithm::Paths => "paths",
} }
) )
} }
} }
/// States of the character-driven state machines in [`extract_paths`] and
/// [`algorithm_path_components`].
#[derive(Copy, Clone, Debug)]
enum PathState {
    /// Initial state, before any character has been consumed.
    Begin,
    /// Inside a path component; consumed characters are appended to the current word.
    PathComponent,
    /// A `/` has just been consumed, ending the previous component.
    PathSeparator,
    /// A boundary character (as decided by the machine's `is_boundary` closure) was
    /// consumed; a new path may start with the next character.
    Boundary,
    /// A character that can neither start nor continue a path was consumed; input is
    /// skipped until the next boundary character.
    NonWord,
    /// The input is exhausted; the machine leaves its loop.
    End,
}
#[tracing::instrument(skip(chars))]
/// Extracts `/`-separated paths from a stream of characters.
///
/// A path is a run of components joined by one or more `/`. A component starts with an
/// ASCII letter (or `_` when it directly follows a separator) and continues with ASCII
/// alphanumerics or `_`. Paths are terminated by a boundary character (newline, space,
/// comma, tab or `|`) or by the end of input.
///
/// When a character that is neither part of a component, a separator nor a boundary is
/// hit in the middle of a component, that partial component is discarded, the components
/// collected before it are emitted, and input is skipped until the next boundary.
///
/// Returns one `Vec<String>` of components per path, in order of appearance. Empty
/// paths (e.g. from a lone `/`) are never part of the result.
fn extract_paths(chars: impl Iterator<Item = char>) -> Vec<Vec<String>> {
    /// Moves the collected components into `list`, unless there are none.
    fn flush_path(list: &mut Vec<Vec<String>>, path: &mut Vec<String>) {
        if !path.is_empty() {
            list.push(std::mem::take(path));
        }
    }

    let mut chars = chars;
    let mut state = PathState::Begin;
    let mut list = Vec::new();
    let mut path = Vec::new();
    let mut word = String::new();

    let is_boundary = |c: char| matches!(c, '\n' | ' ' | ',' | '\t' | '|');

    loop {
        state = match state {
            PathState::Begin => match chars.next() {
                None => PathState::End,
                Some(c) if c.is_ascii_alphabetic() => {
                    word.push(c);
                    PathState::PathComponent
                }
                Some(c) if is_boundary(c) => PathState::Boundary,
                Some('/') => PathState::PathSeparator,
                Some(_) => PathState::NonWord,
            },
            PathState::PathComponent => match chars.next() {
                None => {
                    path.push(std::mem::take(&mut word));
                    flush_path(&mut list, &mut path);
                    PathState::End
                }
                Some(c) if c.is_ascii_alphanumeric() || c == '_' => {
                    word.push(c);
                    PathState::PathComponent
                }
                Some('/') => {
                    path.push(std::mem::take(&mut word));
                    PathState::PathSeparator
                }
                Some(c) if is_boundary(c) => {
                    path.push(std::mem::take(&mut word));
                    flush_path(&mut list, &mut path);
                    PathState::Boundary
                }
                Some(_) => {
                    // The component was interrupted by a non-word character: drop it
                    // and only keep the components that were completed before it.
                    word.clear();
                    flush_path(&mut list, &mut path);
                    PathState::NonWord
                }
            },
            PathState::PathSeparator => match chars.next() {
                None => {
                    flush_path(&mut list, &mut path);
                    PathState::End
                }
                // Repeated separators are collapsed.
                Some('/') => PathState::PathSeparator,
                Some(c) if c.is_ascii_alphabetic() || c == '_' => {
                    word.push(c);
                    PathState::PathComponent
                }
                Some(c) if is_boundary(c) => {
                    flush_path(&mut list, &mut path);
                    PathState::Boundary
                }
                Some(_) => {
                    flush_path(&mut list, &mut path);
                    PathState::NonWord
                }
            },
            PathState::Boundary => match chars.next() {
                None => PathState::End,
                Some(c) if c.is_ascii_alphabetic() => {
                    word.push(c);
                    PathState::PathComponent
                }
                Some(c) if is_boundary(c) => PathState::Boundary,
                Some(_) => PathState::NonWord,
            },
            PathState::NonWord => match chars.next() {
                None => PathState::End,
                Some(c) if is_boundary(c) => PathState::Boundary,
                Some(_) => PathState::NonWord,
            },
            PathState::End => break,
        }
    }

    list
}
#[tracing::instrument(skip(chars))]
/// Counts the words found at each position of the `/`-separated paths in `chars` and
/// prints the result to stdout as a comma-separated table.
///
/// The table has one `Word {i}, Count {i},` column pair per component position. Within
/// a column, words are ordered from highest to lowest count; equal counts are ordered
/// alphabetically so that the output is stable between runs.
///
/// Words shorter than `min_length` are ignored. Words that are not part of a path
/// (i.e. neither followed nor preceded by a `/`) are not counted.
fn algorithm_path_components(chars: impl Iterator<Item = char>, min_length: usize) {
    let lists = count_path_components(chars, min_length);
    print!("{}", render_component_table(lists));
}

/// Runs the path state machine over `chars` and returns, per component position, how
/// often each word of at least `min_length` bytes occurred at that position.
fn count_path_components(
    chars: impl Iterator<Item = char>,
    min_length: usize,
) -> Vec<HashMap<String, usize>> {
    /// Counts `word` in `list` if it is non-empty and long enough; always empties `word`.
    fn record(list: &mut HashMap<String, usize>, word: &mut String, min_length: usize) {
        let word = std::mem::take(word);
        if !word.is_empty() && word.len() >= min_length {
            *list.entry(word).or_insert(0) += 1;
        }
    }

    let mut chars = chars.peekable();
    let mut state = PathState::Begin;
    let mut word = String::new();
    let mut lists = vec![HashMap::<String, usize>::new()];
    // Position of the component currently being read within its path.
    let mut index = 0;

    let is_boundary = |c: char| matches!(c, '\n' | ' ' | ',' | '\t');

    loop {
        state = match state {
            PathState::Begin => match chars.next() {
                None => PathState::End,
                Some(c) if c.is_ascii_alphabetic() => {
                    word.push(c);
                    PathState::PathComponent
                }
                Some(c) if is_boundary(c) => PathState::Boundary,
                // Ignore leading path separators to not trigger the logic of advancing
                // the component count
                Some('/') => PathState::Boundary,
                Some(_) => PathState::NonWord,
            },
            PathState::PathComponent => match chars.next() {
                None => {
                    // Same rule as for any other component end: a word only counts
                    // when it is preceded by a separator.
                    if index > 0 {
                        record(&mut lists[index], &mut word, min_length);
                    }
                    PathState::End
                }
                Some(c) if c.is_ascii_alphanumeric() || c == '_' => {
                    word.push(c);
                    PathState::PathComponent
                }
                Some('/') => PathState::PathSeparator,
                Some(c) => {
                    if index > 0 {
                        record(&mut lists[index], &mut word, min_length);
                    }
                    word.clear();
                    index = 0;

                    if is_boundary(c) {
                        PathState::Boundary
                    } else {
                        PathState::NonWord
                    }
                }
            },
            PathState::PathSeparator => {
                record(&mut lists[index], &mut word, min_length);

                index += 1;
                if lists.len() <= index {
                    lists.push(HashMap::new());
                }

                // Ignore multiple separators
                while chars.next_if(|c| *c == '/').is_some() {}

                match chars.next() {
                    None => PathState::End,
                    Some(c) if c.is_ascii_alphabetic() || c == '_' => {
                        word.push(c);
                        PathState::PathComponent
                    }
                    Some(c) if is_boundary(c) => {
                        index = 0;
                        PathState::Boundary
                    }
                    Some(_) => {
                        index = 0;
                        PathState::NonWord
                    }
                }
            }
            PathState::Boundary => match chars.next() {
                None => PathState::End,
                Some(c) if c.is_ascii_alphabetic() => {
                    word.push(c);
                    PathState::PathComponent
                }
                Some(c) if is_boundary(c) => PathState::Boundary,
                Some(_) => PathState::NonWord,
            },
            PathState::NonWord => match chars.next() {
                None => PathState::End,
                Some(c) if is_boundary(c) => PathState::Boundary,
                Some(_) => PathState::NonWord,
            },
            PathState::End => break,
        }
    }

    lists
}

/// Renders the per-position word counts as the comma-separated table printed by
/// [`algorithm_path_components`]: a header line, then one line per rank.
fn render_component_table(lists: Vec<HashMap<String, usize>>) -> String {
    let mut out = String::new();

    for i in 0..lists.len() {
        out.push_str(&format!("Word {i}, Count {i},"));
    }
    out.push('\n');

    // `lines[j][i]` holds the `j`-th most frequent word of component position `i`.
    let mut lines: Vec<Vec<Option<(String, usize)>>> = Vec::new();

    for (i, list) in lists.into_iter().enumerate() {
        let mut entries: Vec<(String, usize)> = list.into_iter().collect();
        // Highest count first; ties are broken by word to keep the output deterministic.
        entries.sort_unstable_by(|(word_a, a), (word_b, b)| {
            b.cmp(a).then_with(|| word_a.cmp(word_b))
        });

        for (j, entry) in entries.into_iter().enumerate() {
            if lines.len() <= j {
                lines.push(Vec::new());
            }
            let line = &mut lines[j];
            // Pad the columns of earlier positions that had fewer entries.
            if line.len() < i {
                line.resize(i, None);
            }
            line.push(Some(entry));
        }
    }

    for line in &lines {
        for cell in line {
            match cell {
                Some((word, count)) => out.push_str(&format!("{word},{count},")),
                None => out.push_str(",,"),
            }
        }
        out.push('\n');
    }

    out
}
#[derive(Copy, Clone, Debug)] #[derive(Copy, Clone, Debug)]
enum State { enum State {
Begin, Begin,
@ -125,9 +395,14 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
.wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?; .wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
let mut chars = content.chars(); let mut chars = content.chars();
if *algorithm == Algorithm::Paths {
algorithm_path_components(chars, min_length);
return Ok(());
}
let mut state = State::Begin; let mut state = State::Begin;
let mut word = String::new(); let mut word = String::new();
let mut visited = HashSet::new(); let mut visited = HashMap::new();
'machine: loop { 'machine: loop {
state = match state { state = match state {
@ -150,12 +425,11 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
}, },
State::Word => match chars.next() { State::Word => match chars.next() {
None => { None => {
if word.len() >= min_length if word.len() >= min_length && algorithm.is_length(word.len()) {
&& algorithm.is_length(word.len()) visited
&& !visited.contains(&word) .entry(word.clone())
{ .and_modify(|v| *v += 1)
println!("{}", &word); .or_insert(1);
visited.insert(word.clone());
} }
State::End State::End
} }
@ -164,12 +438,11 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
State::Word State::Word
} }
Some(_) => { Some(_) => {
if word.len() >= min_length if word.len() >= min_length && algorithm.is_length(word.len()) {
&& algorithm.is_length(word.len()) visited
&& !visited.contains(&word) .entry(word.clone())
{ .and_modify(|v| *v += 1)
println!("{}", &word); .or_insert(1);
visited.insert(word.clone());
} }
word.clear(); word.clear();
State::NonWord State::NonWord
@ -178,5 +451,13 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
} }
} }
let mut entries: Vec<(String, usize)> = visited.into_iter().collect();
// Reverse sides during comparison to get "highest to lowest"
entries.sort_by(|(_, a), (_, b)| b.partial_cmp(a).unwrap());
entries
.iter()
.for_each(|(word, count)| println!("{:016} {}", word, count));
Ok(()) Ok(())
} }