dtmt: Add word extraction algorithm for paths
This commit is contained in:
parent
6ada4c1c43
commit
ae1e7e5aa6
1 changed file with 296 additions and 15 deletions
|
@ -1,4 +1,4 @@
|
||||||
use std::collections::HashSet;
|
use std::collections::HashMap;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
|
||||||
use clap::{value_parser, Arg, ArgMatches, Command, ValueEnum};
|
use clap::{value_parser, Arg, ArgMatches, Command, ValueEnum};
|
||||||
|
@ -36,7 +36,7 @@ pub(crate) fn command_definition() -> Command {
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Copy, Clone, Debug, ValueEnum)]
|
#[derive(Copy, Clone, Debug, Eq, PartialEq, ValueEnum)]
|
||||||
#[value(rename_all = "snake_case")]
|
#[value(rename_all = "snake_case")]
|
||||||
enum Algorithm {
|
enum Algorithm {
|
||||||
Alphabetic,
|
Alphabetic,
|
||||||
|
@ -45,6 +45,7 @@ enum Algorithm {
|
||||||
Number,
|
Number,
|
||||||
Hash32,
|
Hash32,
|
||||||
Hash64,
|
Hash64,
|
||||||
|
Paths,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Algorithm {
|
impl Algorithm {
|
||||||
|
@ -55,6 +56,8 @@ impl Algorithm {
|
||||||
Self::Identifier => c.is_ascii_alphabetic(),
|
Self::Identifier => c.is_ascii_alphabetic(),
|
||||||
Self::Number => c.is_numeric(),
|
Self::Number => c.is_numeric(),
|
||||||
Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
|
Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
|
||||||
|
// Supposed to be handled separately
|
||||||
|
Self::Paths => false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -65,6 +68,8 @@ impl Algorithm {
|
||||||
Self::Identifier => c.is_ascii_alphanumeric(),
|
Self::Identifier => c.is_ascii_alphanumeric(),
|
||||||
Self::Number => c.is_numeric(),
|
Self::Number => c.is_numeric(),
|
||||||
Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
|
Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
|
||||||
|
// Supposed to be handled separately
|
||||||
|
Self::Paths => false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -76,6 +81,8 @@ impl Algorithm {
|
||||||
Self::Number => true,
|
Self::Number => true,
|
||||||
Self::Hash32 => len == 8,
|
Self::Hash32 => len == 8,
|
||||||
Self::Hash64 => len == 16,
|
Self::Hash64 => len == 16,
|
||||||
|
// Supposed to be handled separately
|
||||||
|
Self::Paths => false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -92,11 +99,274 @@ impl std::fmt::Display for Algorithm {
|
||||||
Algorithm::Number => "number",
|
Algorithm::Number => "number",
|
||||||
Algorithm::Hash32 => "hash32",
|
Algorithm::Hash32 => "hash32",
|
||||||
Algorithm::Hash64 => "hash64",
|
Algorithm::Hash64 => "hash64",
|
||||||
|
Algorithm::Paths => "paths",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// States of the character-driven state machines that scan text for
/// `/`-separated paths.
///
/// - `Begin`: no character has been consumed yet.
/// - `PathComponent`: currently accumulating the characters of a path component.
/// - `PathSeparator`: a `/` was just consumed after a component (or at the start of input).
/// - `Boundary`: a boundary character (whitespace, `,`, ...) was just consumed;
///   a new path may start with the next character.
/// - `NonWord`: inside a run of characters that cannot be part of a path;
///   input is skipped until the next boundary character.
/// - `End`: the input is exhausted and the machine stops.
#[derive(Copy, Clone, Debug)]
|
||||||
|
enum PathState {
|
||||||
|
Begin,
|
||||||
|
PathComponent,
|
||||||
|
PathSeparator,
|
||||||
|
Boundary,
|
||||||
|
NonWord,
|
||||||
|
End,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tracing::instrument(skip(chars))]
|
||||||
|
fn extract_paths(chars: impl Iterator<Item = char>) -> Vec<Vec<String>> {
|
||||||
|
let mut chars = chars.peekable();
|
||||||
|
|
||||||
|
let mut state = PathState::Begin;
|
||||||
|
let mut list = Vec::new();
|
||||||
|
let mut path = Vec::new();
|
||||||
|
let mut word = String::new();
|
||||||
|
|
||||||
|
let is_boundary = |c: char| c == '\n' || c == ' ' || c == ',' || c == '\t' || c == '|';
|
||||||
|
|
||||||
|
'machine: loop {
|
||||||
|
state = match state {
|
||||||
|
PathState::Begin => match chars.next() {
|
||||||
|
None => PathState::End,
|
||||||
|
Some(c) if c.is_ascii_alphabetic() => {
|
||||||
|
word.push(c);
|
||||||
|
PathState::PathComponent
|
||||||
|
}
|
||||||
|
Some(c) if is_boundary(c) => PathState::Boundary,
|
||||||
|
Some('/') => PathState::PathSeparator,
|
||||||
|
Some(_) => PathState::NonWord,
|
||||||
|
},
|
||||||
|
PathState::PathComponent => match chars.next() {
|
||||||
|
None => {
|
||||||
|
path.push(word.clone());
|
||||||
|
list.push(path.clone());
|
||||||
|
|
||||||
|
PathState::End
|
||||||
|
}
|
||||||
|
Some(c) if c.is_ascii_alphanumeric() || c == '_' => {
|
||||||
|
word.push(c);
|
||||||
|
PathState::PathComponent
|
||||||
|
}
|
||||||
|
Some('/') => {
|
||||||
|
path.push(word.clone());
|
||||||
|
word.clear();
|
||||||
|
|
||||||
|
PathState::PathSeparator
|
||||||
|
}
|
||||||
|
Some(c) if is_boundary(c) => {
|
||||||
|
path.push(word.clone());
|
||||||
|
list.push(path.clone());
|
||||||
|
|
||||||
|
path.clear();
|
||||||
|
word.clear();
|
||||||
|
|
||||||
|
PathState::Boundary
|
||||||
|
}
|
||||||
|
Some(_) => {
|
||||||
|
list.push(path.clone());
|
||||||
|
|
||||||
|
path.clear();
|
||||||
|
word.clear();
|
||||||
|
|
||||||
|
PathState::NonWord
|
||||||
|
}
|
||||||
|
},
|
||||||
|
PathState::PathSeparator => match chars.next() {
|
||||||
|
None => {
|
||||||
|
list.push(path.clone());
|
||||||
|
PathState::End
|
||||||
|
}
|
||||||
|
Some('/') => PathState::PathSeparator,
|
||||||
|
Some(c) if c.is_ascii_alphabetic() || c == '_' => {
|
||||||
|
word.push(c);
|
||||||
|
PathState::PathComponent
|
||||||
|
}
|
||||||
|
Some(c) if is_boundary(c) => {
|
||||||
|
list.push(path.clone());
|
||||||
|
path.clear();
|
||||||
|
PathState::Boundary
|
||||||
|
}
|
||||||
|
Some(_) => {
|
||||||
|
list.push(path.clone());
|
||||||
|
path.clear();
|
||||||
|
PathState::NonWord
|
||||||
|
}
|
||||||
|
},
|
||||||
|
PathState::Boundary => match chars.next() {
|
||||||
|
None => PathState::End,
|
||||||
|
Some(c) if c.is_ascii_alphabetic() => {
|
||||||
|
word.push(c);
|
||||||
|
PathState::PathComponent
|
||||||
|
}
|
||||||
|
Some(c) if is_boundary(c) => PathState::Boundary,
|
||||||
|
Some(_) => PathState::NonWord,
|
||||||
|
},
|
||||||
|
PathState::NonWord => match chars.next() {
|
||||||
|
None => PathState::End,
|
||||||
|
Some(c) if is_boundary(c) => PathState::Boundary,
|
||||||
|
Some(_) => PathState::NonWord,
|
||||||
|
},
|
||||||
|
PathState::End => {
|
||||||
|
break 'machine;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
list
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tracing::instrument(skip(chars))]
|
||||||
|
fn algorithm_path_components(chars: impl Iterator<Item = char>, min_length: usize) {
|
||||||
|
let mut chars = chars.peekable();
|
||||||
|
|
||||||
|
let mut state = PathState::Begin;
|
||||||
|
let mut word = String::new();
|
||||||
|
let mut lists = vec![HashMap::<String, usize>::new()];
|
||||||
|
let mut index = 0;
|
||||||
|
|
||||||
|
let is_boundary = |c: char| c == '\n' || c == ' ' || c == ',' || c == '\t';
|
||||||
|
|
||||||
|
'machine: loop {
|
||||||
|
state = match state {
|
||||||
|
PathState::Begin => match chars.next() {
|
||||||
|
None => PathState::End,
|
||||||
|
Some(c) if c.is_ascii_alphabetic() => {
|
||||||
|
word.push(c);
|
||||||
|
PathState::PathComponent
|
||||||
|
}
|
||||||
|
Some(c) if is_boundary(c) => PathState::Boundary,
|
||||||
|
// Ignore leading path separators to not trigger the logic of advancing
|
||||||
|
// the component count
|
||||||
|
Some('/') => PathState::Boundary,
|
||||||
|
Some(_) => PathState::NonWord,
|
||||||
|
},
|
||||||
|
PathState::PathComponent => match chars.next() {
|
||||||
|
None => PathState::End,
|
||||||
|
Some(c) if c.is_ascii_alphanumeric() || c == '_' => {
|
||||||
|
word.push(c);
|
||||||
|
PathState::PathComponent
|
||||||
|
}
|
||||||
|
Some('/') => PathState::PathSeparator,
|
||||||
|
Some(c) => {
|
||||||
|
if index > 0 && word.len() >= min_length {
|
||||||
|
let list = &mut lists[index];
|
||||||
|
list.entry(word.clone())
|
||||||
|
.and_modify(|count| *count += 1)
|
||||||
|
.or_insert(1);
|
||||||
|
}
|
||||||
|
word.clear();
|
||||||
|
|
||||||
|
index = 0;
|
||||||
|
|
||||||
|
if is_boundary(c) {
|
||||||
|
PathState::Boundary
|
||||||
|
} else {
|
||||||
|
PathState::NonWord
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
PathState::PathSeparator => {
|
||||||
|
if word.len() >= min_length {
|
||||||
|
let list = &mut lists[index];
|
||||||
|
list.entry(word.clone())
|
||||||
|
.and_modify(|count| *count += 1)
|
||||||
|
.or_insert(1);
|
||||||
|
}
|
||||||
|
word.clear();
|
||||||
|
|
||||||
|
index += 1;
|
||||||
|
if lists.get(index).is_none() {
|
||||||
|
lists.push(HashMap::new());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ignore multiple separators
|
||||||
|
while chars.next_if(|c| *c == '/').is_some() {}
|
||||||
|
|
||||||
|
match chars.next() {
|
||||||
|
None => PathState::End,
|
||||||
|
Some(c) if c.is_ascii_alphabetic() || c == '_' => {
|
||||||
|
word.push(c);
|
||||||
|
PathState::PathComponent
|
||||||
|
}
|
||||||
|
Some(c) if is_boundary(c) => {
|
||||||
|
index = 0;
|
||||||
|
PathState::Boundary
|
||||||
|
}
|
||||||
|
Some(_) => {
|
||||||
|
index = 0;
|
||||||
|
PathState::NonWord
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
PathState::Boundary => match chars.next() {
|
||||||
|
None => PathState::End,
|
||||||
|
Some(c) if c.is_ascii_alphabetic() => {
|
||||||
|
word.push(c);
|
||||||
|
PathState::PathComponent
|
||||||
|
}
|
||||||
|
Some(c) if is_boundary(c) => PathState::Boundary,
|
||||||
|
Some(_) => PathState::NonWord,
|
||||||
|
},
|
||||||
|
PathState::NonWord => match chars.next() {
|
||||||
|
None => PathState::End,
|
||||||
|
Some(c) if is_boundary(c) => PathState::Boundary,
|
||||||
|
Some(_) => PathState::NonWord,
|
||||||
|
},
|
||||||
|
PathState::End => {
|
||||||
|
if word.len() >= min_length {
|
||||||
|
let list = &mut lists[index];
|
||||||
|
list.entry(word.clone())
|
||||||
|
.and_modify(|count| *count += 1)
|
||||||
|
.or_insert(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
break 'machine;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for i in 0..lists.len() {
|
||||||
|
print!("Word {i}, Count {i},");
|
||||||
|
}
|
||||||
|
println!();
|
||||||
|
|
||||||
|
let mut lines: Vec<Vec<Option<(String, usize)>>> = Vec::new();
|
||||||
|
|
||||||
|
for (i, list) in lists.into_iter().enumerate() {
|
||||||
|
let mut entries = list.into_iter().collect::<Vec<_>>();
|
||||||
|
entries.sort_by(|(_, a), (_, b)| b.partial_cmp(a).unwrap());
|
||||||
|
|
||||||
|
for (j, (word, count)) in entries.into_iter().enumerate() {
|
||||||
|
if let Some(line) = lines.get_mut(j) {
|
||||||
|
while line.len() < i {
|
||||||
|
line.push(None);
|
||||||
|
}
|
||||||
|
line.push(Some((word, count)));
|
||||||
|
} else {
|
||||||
|
let mut line = Vec::new();
|
||||||
|
while line.len() < i {
|
||||||
|
line.push(None);
|
||||||
|
}
|
||||||
|
line.push(Some((word, count)));
|
||||||
|
lines.push(line);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for line in lines.iter() {
|
||||||
|
for cell in line.iter() {
|
||||||
|
if let Some((word, count)) = cell {
|
||||||
|
print!("{},{},", word, count);
|
||||||
|
} else {
|
||||||
|
print!(",,");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
println!();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Copy, Clone, Debug)]
|
#[derive(Copy, Clone, Debug)]
|
||||||
enum State {
|
enum State {
|
||||||
Begin,
|
Begin,
|
||||||
|
@ -125,9 +395,14 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
|
||||||
.wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
|
.wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
|
||||||
let mut chars = content.chars();
|
let mut chars = content.chars();
|
||||||
|
|
||||||
|
if *algorithm == Algorithm::Paths {
|
||||||
|
algorithm_path_components(chars, min_length);
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
let mut state = State::Begin;
|
let mut state = State::Begin;
|
||||||
let mut word = String::new();
|
let mut word = String::new();
|
||||||
let mut visited = HashSet::new();
|
let mut visited = HashMap::new();
|
||||||
|
|
||||||
'machine: loop {
|
'machine: loop {
|
||||||
state = match state {
|
state = match state {
|
||||||
|
@ -150,12 +425,11 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
|
||||||
},
|
},
|
||||||
State::Word => match chars.next() {
|
State::Word => match chars.next() {
|
||||||
None => {
|
None => {
|
||||||
if word.len() >= min_length
|
if word.len() >= min_length && algorithm.is_length(word.len()) {
|
||||||
&& algorithm.is_length(word.len())
|
visited
|
||||||
&& !visited.contains(&word)
|
.entry(word.clone())
|
||||||
{
|
.and_modify(|v| *v += 1)
|
||||||
println!("{}", &word);
|
.or_insert(1);
|
||||||
visited.insert(word.clone());
|
|
||||||
}
|
}
|
||||||
State::End
|
State::End
|
||||||
}
|
}
|
||||||
|
@ -164,12 +438,11 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
|
||||||
State::Word
|
State::Word
|
||||||
}
|
}
|
||||||
Some(_) => {
|
Some(_) => {
|
||||||
if word.len() >= min_length
|
if word.len() >= min_length && algorithm.is_length(word.len()) {
|
||||||
&& algorithm.is_length(word.len())
|
visited
|
||||||
&& !visited.contains(&word)
|
.entry(word.clone())
|
||||||
{
|
.and_modify(|v| *v += 1)
|
||||||
println!("{}", &word);
|
.or_insert(1);
|
||||||
visited.insert(word.clone());
|
|
||||||
}
|
}
|
||||||
word.clear();
|
word.clear();
|
||||||
State::NonWord
|
State::NonWord
|
||||||
|
@ -178,5 +451,13 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let mut entries: Vec<(String, usize)> = visited.into_iter().collect();
|
||||||
|
// Reverse sides during comparison to get "highest to lowest"
|
||||||
|
entries.sort_by(|(_, a), (_, b)| b.partial_cmp(a).unwrap());
|
||||||
|
|
||||||
|
entries
|
||||||
|
.iter()
|
||||||
|
.for_each(|(word, count)| println!("{:016} {}", word, count));
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Reference in a new issue