dtmt: Add word extraction algorithm for paths
This commit is contained in:
parent
6ada4c1c43
commit
ae1e7e5aa6
1 changed file with 296 additions and 15 deletions
|
@ -1,4 +1,4 @@
|
|||
use std::collections::HashSet;
|
||||
use std::collections::HashMap;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use clap::{value_parser, Arg, ArgMatches, Command, ValueEnum};
|
||||
|
@ -36,7 +36,7 @@ pub(crate) fn command_definition() -> Command {
|
|||
)
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug, ValueEnum)]
|
||||
#[derive(Copy, Clone, Debug, Eq, PartialEq, ValueEnum)]
|
||||
#[value(rename_all = "snake_case")]
|
||||
enum Algorithm {
|
||||
Alphabetic,
|
||||
|
@ -45,6 +45,7 @@ enum Algorithm {
|
|||
Number,
|
||||
Hash32,
|
||||
Hash64,
|
||||
Paths,
|
||||
}
|
||||
|
||||
impl Algorithm {
|
||||
|
@ -55,6 +56,8 @@ impl Algorithm {
|
|||
Self::Identifier => c.is_ascii_alphabetic(),
|
||||
Self::Number => c.is_numeric(),
|
||||
Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
|
||||
// Supposed to be handled separately
|
||||
Self::Paths => false,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -65,6 +68,8 @@ impl Algorithm {
|
|||
Self::Identifier => c.is_ascii_alphanumeric(),
|
||||
Self::Number => c.is_numeric(),
|
||||
Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
|
||||
// Supposed to be handled separately
|
||||
Self::Paths => false,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -76,6 +81,8 @@ impl Algorithm {
|
|||
Self::Number => true,
|
||||
Self::Hash32 => len == 8,
|
||||
Self::Hash64 => len == 16,
|
||||
// Supposed to be handled separately
|
||||
Self::Paths => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -92,11 +99,274 @@ impl std::fmt::Display for Algorithm {
|
|||
Algorithm::Number => "number",
|
||||
Algorithm::Hash32 => "hash32",
|
||||
Algorithm::Hash64 => "hash64",
|
||||
Algorithm::Paths => "paths",
|
||||
}
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// States of the character-driven state machines that scan text for
/// slash-separated paths (used by `extract_paths` and
/// `algorithm_path_components`).
#[derive(Copy, Clone, Debug)]
|
||||
enum PathState {
|
||||
/// Initial state, before any character has been read.
Begin,
|
||||
/// Characters of a word (one path component) are being accumulated.
PathComponent,
|
||||
/// A `/` has just been read; the next word is a further path component.
PathSeparator,
|
||||
/// A delimiter character was read; the next character may start a new word.
Boundary,
|
||||
/// A character that cannot be part of a path was read; input is skipped
/// until the next delimiter.
NonWord,
|
||||
/// The input is exhausted; the state machine loop terminates.
End,
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip(chars))]
|
||||
fn extract_paths(chars: impl Iterator<Item = char>) -> Vec<Vec<String>> {
|
||||
let mut chars = chars.peekable();
|
||||
|
||||
let mut state = PathState::Begin;
|
||||
let mut list = Vec::new();
|
||||
let mut path = Vec::new();
|
||||
let mut word = String::new();
|
||||
|
||||
let is_boundary = |c: char| c == '\n' || c == ' ' || c == ',' || c == '\t' || c == '|';
|
||||
|
||||
'machine: loop {
|
||||
state = match state {
|
||||
PathState::Begin => match chars.next() {
|
||||
None => PathState::End,
|
||||
Some(c) if c.is_ascii_alphabetic() => {
|
||||
word.push(c);
|
||||
PathState::PathComponent
|
||||
}
|
||||
Some(c) if is_boundary(c) => PathState::Boundary,
|
||||
Some('/') => PathState::PathSeparator,
|
||||
Some(_) => PathState::NonWord,
|
||||
},
|
||||
PathState::PathComponent => match chars.next() {
|
||||
None => {
|
||||
path.push(word.clone());
|
||||
list.push(path.clone());
|
||||
|
||||
PathState::End
|
||||
}
|
||||
Some(c) if c.is_ascii_alphanumeric() || c == '_' => {
|
||||
word.push(c);
|
||||
PathState::PathComponent
|
||||
}
|
||||
Some('/') => {
|
||||
path.push(word.clone());
|
||||
word.clear();
|
||||
|
||||
PathState::PathSeparator
|
||||
}
|
||||
Some(c) if is_boundary(c) => {
|
||||
path.push(word.clone());
|
||||
list.push(path.clone());
|
||||
|
||||
path.clear();
|
||||
word.clear();
|
||||
|
||||
PathState::Boundary
|
||||
}
|
||||
Some(_) => {
|
||||
list.push(path.clone());
|
||||
|
||||
path.clear();
|
||||
word.clear();
|
||||
|
||||
PathState::NonWord
|
||||
}
|
||||
},
|
||||
PathState::PathSeparator => match chars.next() {
|
||||
None => {
|
||||
list.push(path.clone());
|
||||
PathState::End
|
||||
}
|
||||
Some('/') => PathState::PathSeparator,
|
||||
Some(c) if c.is_ascii_alphabetic() || c == '_' => {
|
||||
word.push(c);
|
||||
PathState::PathComponent
|
||||
}
|
||||
Some(c) if is_boundary(c) => {
|
||||
list.push(path.clone());
|
||||
path.clear();
|
||||
PathState::Boundary
|
||||
}
|
||||
Some(_) => {
|
||||
list.push(path.clone());
|
||||
path.clear();
|
||||
PathState::NonWord
|
||||
}
|
||||
},
|
||||
PathState::Boundary => match chars.next() {
|
||||
None => PathState::End,
|
||||
Some(c) if c.is_ascii_alphabetic() => {
|
||||
word.push(c);
|
||||
PathState::PathComponent
|
||||
}
|
||||
Some(c) if is_boundary(c) => PathState::Boundary,
|
||||
Some(_) => PathState::NonWord,
|
||||
},
|
||||
PathState::NonWord => match chars.next() {
|
||||
None => PathState::End,
|
||||
Some(c) if is_boundary(c) => PathState::Boundary,
|
||||
Some(_) => PathState::NonWord,
|
||||
},
|
||||
PathState::End => {
|
||||
break 'machine;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
list
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip(chars))]
|
||||
fn algorithm_path_components(chars: impl Iterator<Item = char>, min_length: usize) {
|
||||
let mut chars = chars.peekable();
|
||||
|
||||
let mut state = PathState::Begin;
|
||||
let mut word = String::new();
|
||||
let mut lists = vec![HashMap::<String, usize>::new()];
|
||||
let mut index = 0;
|
||||
|
||||
let is_boundary = |c: char| c == '\n' || c == ' ' || c == ',' || c == '\t';
|
||||
|
||||
'machine: loop {
|
||||
state = match state {
|
||||
PathState::Begin => match chars.next() {
|
||||
None => PathState::End,
|
||||
Some(c) if c.is_ascii_alphabetic() => {
|
||||
word.push(c);
|
||||
PathState::PathComponent
|
||||
}
|
||||
Some(c) if is_boundary(c) => PathState::Boundary,
|
||||
// Ignore leading path separators to not trigger the logic of advancing
|
||||
// the component count
|
||||
Some('/') => PathState::Boundary,
|
||||
Some(_) => PathState::NonWord,
|
||||
},
|
||||
PathState::PathComponent => match chars.next() {
|
||||
None => PathState::End,
|
||||
Some(c) if c.is_ascii_alphanumeric() || c == '_' => {
|
||||
word.push(c);
|
||||
PathState::PathComponent
|
||||
}
|
||||
Some('/') => PathState::PathSeparator,
|
||||
Some(c) => {
|
||||
if index > 0 && word.len() >= min_length {
|
||||
let list = &mut lists[index];
|
||||
list.entry(word.clone())
|
||||
.and_modify(|count| *count += 1)
|
||||
.or_insert(1);
|
||||
}
|
||||
word.clear();
|
||||
|
||||
index = 0;
|
||||
|
||||
if is_boundary(c) {
|
||||
PathState::Boundary
|
||||
} else {
|
||||
PathState::NonWord
|
||||
}
|
||||
}
|
||||
},
|
||||
PathState::PathSeparator => {
|
||||
if word.len() >= min_length {
|
||||
let list = &mut lists[index];
|
||||
list.entry(word.clone())
|
||||
.and_modify(|count| *count += 1)
|
||||
.or_insert(1);
|
||||
}
|
||||
word.clear();
|
||||
|
||||
index += 1;
|
||||
if lists.get(index).is_none() {
|
||||
lists.push(HashMap::new());
|
||||
}
|
||||
|
||||
// Ignore multiple separators
|
||||
while chars.next_if(|c| *c == '/').is_some() {}
|
||||
|
||||
match chars.next() {
|
||||
None => PathState::End,
|
||||
Some(c) if c.is_ascii_alphabetic() || c == '_' => {
|
||||
word.push(c);
|
||||
PathState::PathComponent
|
||||
}
|
||||
Some(c) if is_boundary(c) => {
|
||||
index = 0;
|
||||
PathState::Boundary
|
||||
}
|
||||
Some(_) => {
|
||||
index = 0;
|
||||
PathState::NonWord
|
||||
}
|
||||
}
|
||||
}
|
||||
PathState::Boundary => match chars.next() {
|
||||
None => PathState::End,
|
||||
Some(c) if c.is_ascii_alphabetic() => {
|
||||
word.push(c);
|
||||
PathState::PathComponent
|
||||
}
|
||||
Some(c) if is_boundary(c) => PathState::Boundary,
|
||||
Some(_) => PathState::NonWord,
|
||||
},
|
||||
PathState::NonWord => match chars.next() {
|
||||
None => PathState::End,
|
||||
Some(c) if is_boundary(c) => PathState::Boundary,
|
||||
Some(_) => PathState::NonWord,
|
||||
},
|
||||
PathState::End => {
|
||||
if word.len() >= min_length {
|
||||
let list = &mut lists[index];
|
||||
list.entry(word.clone())
|
||||
.and_modify(|count| *count += 1)
|
||||
.or_insert(1);
|
||||
}
|
||||
|
||||
break 'machine;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for i in 0..lists.len() {
|
||||
print!("Word {i}, Count {i},");
|
||||
}
|
||||
println!();
|
||||
|
||||
let mut lines: Vec<Vec<Option<(String, usize)>>> = Vec::new();
|
||||
|
||||
for (i, list) in lists.into_iter().enumerate() {
|
||||
let mut entries = list.into_iter().collect::<Vec<_>>();
|
||||
entries.sort_by(|(_, a), (_, b)| b.partial_cmp(a).unwrap());
|
||||
|
||||
for (j, (word, count)) in entries.into_iter().enumerate() {
|
||||
if let Some(line) = lines.get_mut(j) {
|
||||
while line.len() < i {
|
||||
line.push(None);
|
||||
}
|
||||
line.push(Some((word, count)));
|
||||
} else {
|
||||
let mut line = Vec::new();
|
||||
while line.len() < i {
|
||||
line.push(None);
|
||||
}
|
||||
line.push(Some((word, count)));
|
||||
lines.push(line);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for line in lines.iter() {
|
||||
for cell in line.iter() {
|
||||
if let Some((word, count)) = cell {
|
||||
print!("{},{},", word, count);
|
||||
} else {
|
||||
print!(",,");
|
||||
}
|
||||
}
|
||||
println!();
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
enum State {
|
||||
Begin,
|
||||
|
@ -125,9 +395,14 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
|
|||
.wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
|
||||
let mut chars = content.chars();
|
||||
|
||||
if *algorithm == Algorithm::Paths {
|
||||
algorithm_path_components(chars, min_length);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut state = State::Begin;
|
||||
let mut word = String::new();
|
||||
let mut visited = HashSet::new();
|
||||
let mut visited = HashMap::new();
|
||||
|
||||
'machine: loop {
|
||||
state = match state {
|
||||
|
@ -150,12 +425,11 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
|
|||
},
|
||||
State::Word => match chars.next() {
|
||||
None => {
|
||||
if word.len() >= min_length
|
||||
&& algorithm.is_length(word.len())
|
||||
&& !visited.contains(&word)
|
||||
{
|
||||
println!("{}", &word);
|
||||
visited.insert(word.clone());
|
||||
if word.len() >= min_length && algorithm.is_length(word.len()) {
|
||||
visited
|
||||
.entry(word.clone())
|
||||
.and_modify(|v| *v += 1)
|
||||
.or_insert(1);
|
||||
}
|
||||
State::End
|
||||
}
|
||||
|
@ -164,12 +438,11 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
|
|||
State::Word
|
||||
}
|
||||
Some(_) => {
|
||||
if word.len() >= min_length
|
||||
&& algorithm.is_length(word.len())
|
||||
&& !visited.contains(&word)
|
||||
{
|
||||
println!("{}", &word);
|
||||
visited.insert(word.clone());
|
||||
if word.len() >= min_length && algorithm.is_length(word.len()) {
|
||||
visited
|
||||
.entry(word.clone())
|
||||
.and_modify(|v| *v += 1)
|
||||
.or_insert(1);
|
||||
}
|
||||
word.clear();
|
||||
State::NonWord
|
||||
|
@ -178,5 +451,13 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
|
|||
}
|
||||
}
|
||||
|
||||
let mut entries: Vec<(String, usize)> = visited.into_iter().collect();
|
||||
// Reverse sides during comparison to get "highest to lowest"
|
||||
entries.sort_by(|(_, a), (_, b)| b.partial_cmp(a).unwrap());
|
||||
|
||||
entries
|
||||
.iter()
|
||||
.for_each(|(word, count)| println!("{:016} {}", word, count));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue