dtmt: Add command to extract words from file
As part of trying to brute force values for the dictionary, this allows extracting candidate words from a file.
This commit is contained in:
parent
2daff544a5
commit
94347d57f9
2 changed files with 186 additions and 0 deletions
182
crates/dtmt/src/cmd/experiment/extract_words.rs
Normal file
182
crates/dtmt/src/cmd/experiment/extract_words.rs
Normal file
|
@ -0,0 +1,182 @@
|
|||
use std::collections::HashSet;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use clap::{value_parser, Arg, ArgMatches, Command, ValueEnum};
|
||||
use color_eyre::eyre::Context;
|
||||
use color_eyre::Result;
|
||||
use tokio::fs;
|
||||
|
||||
pub(crate) fn command_definition() -> Command {
|
||||
Command::new("extract-words")
|
||||
.about(
|
||||
"Extract unique alphanumeric sequences that match common identifier rules from the given file. \
|
||||
Only ASCII is supported.",
|
||||
)
|
||||
.arg(
|
||||
Arg::new("file")
|
||||
.required(true)
|
||||
.value_parser(value_parser!(PathBuf))
|
||||
.help("Path to the file to extract words from."),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("min-length")
|
||||
.help("Minimum length to consider a word.")
|
||||
.long("min-length")
|
||||
.short('m')
|
||||
.default_value("3")
|
||||
.value_parser(value_parser!(usize))
|
||||
)
|
||||
.arg(
|
||||
Arg::new("algorithm")
|
||||
.help("The algorithm to determine matching words")
|
||||
.long("algorithm")
|
||||
.short('a')
|
||||
.default_value("identifier")
|
||||
.value_parser(value_parser!(Algorithm))
|
||||
)
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug, ValueEnum)]
|
||||
#[value(rename_all = "snake_case")]
|
||||
enum Algorithm {
|
||||
Alphabetic,
|
||||
Alphanumeric,
|
||||
Identifier,
|
||||
Number,
|
||||
Hash32,
|
||||
Hash64,
|
||||
}
|
||||
|
||||
impl Algorithm {
|
||||
fn is_start(&self, c: char) -> bool {
|
||||
match self {
|
||||
Self::Alphabetic => c.is_ascii_alphabetic(),
|
||||
Self::Alphanumeric => c.is_ascii_alphanumeric(),
|
||||
Self::Identifier => c.is_ascii_alphabetic(),
|
||||
Self::Number => c.is_numeric(),
|
||||
Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
|
||||
}
|
||||
}
|
||||
|
||||
fn is_body(&self, c: char) -> bool {
|
||||
match self {
|
||||
Self::Alphabetic => c.is_ascii_alphabetic(),
|
||||
Self::Alphanumeric => c.is_ascii_alphanumeric(),
|
||||
Self::Identifier => c.is_ascii_alphanumeric(),
|
||||
Self::Number => c.is_numeric(),
|
||||
Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
|
||||
}
|
||||
}
|
||||
|
||||
fn is_length(&self, len: usize) -> bool {
|
||||
match self {
|
||||
Self::Alphabetic => true,
|
||||
Self::Alphanumeric => true,
|
||||
Self::Identifier => true,
|
||||
Self::Number => true,
|
||||
Self::Hash32 => len == 8,
|
||||
Self::Hash64 => len == 16,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Algorithm {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"{}",
|
||||
match self {
|
||||
Algorithm::Alphabetic => "alphabetic",
|
||||
Algorithm::Alphanumeric => "alphanumeric",
|
||||
Algorithm::Identifier => "identifier",
|
||||
Algorithm::Number => "number",
|
||||
Algorithm::Hash32 => "hash32",
|
||||
Algorithm::Hash64 => "hash64",
|
||||
}
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// States of the character-by-character scanner that `run` drives.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum State {
    /// Initial state, before any character has been read.
    Begin,
    /// The last character read was not part of a word.
    NonWord,
    /// A word is currently being accumulated.
    Word,
    /// The input is exhausted; the scanner stops.
    End,
}
|
||||
|
||||
#[tracing::instrument(skip_all)]
|
||||
pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
|
||||
let path = matches
|
||||
.get_one::<PathBuf>("file")
|
||||
.expect("missing required parameter");
|
||||
|
||||
let algorithm = matches
|
||||
.get_one::<Algorithm>("algorithm")
|
||||
.expect("parameter has default");
|
||||
|
||||
let min_length = matches
|
||||
.get_one::<usize>("min-length")
|
||||
.copied()
|
||||
.expect("paramter has default");
|
||||
|
||||
let content = fs::read_to_string(&path)
|
||||
.await
|
||||
.wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
|
||||
let mut chars = content.chars();
|
||||
|
||||
let mut state = State::Begin;
|
||||
let mut word = String::new();
|
||||
let mut visited = HashSet::new();
|
||||
|
||||
'machine: loop {
|
||||
state = match state {
|
||||
State::Begin => match chars.next() {
|
||||
None => State::End,
|
||||
Some(c) if algorithm.is_start(c) => {
|
||||
word.push(c);
|
||||
State::Word
|
||||
}
|
||||
Some(_) => State::NonWord,
|
||||
},
|
||||
State::End => break 'machine,
|
||||
State::NonWord => match chars.next() {
|
||||
None => State::End,
|
||||
Some(c) if algorithm.is_body(c) => {
|
||||
word.push(c);
|
||||
State::Word
|
||||
}
|
||||
Some(_) => State::NonWord,
|
||||
},
|
||||
State::Word => match chars.next() {
|
||||
None => {
|
||||
if word.len() >= min_length
|
||||
&& algorithm.is_length(word.len())
|
||||
&& !visited.contains(&word)
|
||||
{
|
||||
println!("{}", &word);
|
||||
visited.insert(word.clone());
|
||||
}
|
||||
State::End
|
||||
}
|
||||
Some(c) if algorithm.is_body(c) => {
|
||||
word.push(c);
|
||||
State::Word
|
||||
}
|
||||
Some(_) => {
|
||||
if word.len() >= min_length
|
||||
&& algorithm.is_length(word.len())
|
||||
&& !visited.contains(&word)
|
||||
{
|
||||
println!("{}", &word);
|
||||
visited.insert(word.clone());
|
||||
}
|
||||
word.clear();
|
||||
State::NonWord
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
|
@ -1,15 +1,19 @@
|
|||
use clap::{ArgMatches, Command};
|
||||
use color_eyre::Result;
|
||||
|
||||
mod extract_words;
|
||||
|
||||
pub(crate) fn command_definition() -> Command {
|
||||
Command::new("experiment")
|
||||
.subcommand_required(true)
|
||||
.about("A collection of utilities and experiments.")
|
||||
.subcommand(extract_words::command_definition())
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all)]
|
||||
pub(crate) async fn run(ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
|
||||
match matches.subcommand() {
|
||||
Some(("extract-words", sub_matches)) => extract_words::run(ctx, sub_matches).await,
|
||||
_ => unreachable!(
|
||||
"clap is configured to require a subcommand, and they're all handled above"
|
||||
),
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue