dtmt: Add command to extract words from file

As part of trying to brute force values for the dictionary,
this allows extracting candidate words from a file.
This commit is contained in:
Lucas Schwiderski 2023-09-16 18:43:52 +02:00
parent 2daff544a5
commit 94347d57f9
Signed by: lucas
GPG key ID: AA12679AAA6DF4D8
2 changed files with 186 additions and 0 deletions

View file

@ -0,0 +1,182 @@
use std::collections::HashSet;
use std::path::PathBuf;
use clap::{value_parser, Arg, ArgMatches, Command, ValueEnum};
use color_eyre::eyre::Context;
use color_eyre::Result;
use tokio::fs;
/// Builds the clap definition of the `extract-words` subcommand.
///
/// The subcommand takes one required positional `file` path and two options
/// that both carry defaults: `--min-length`/`-m` (`3`) and
/// `--algorithm`/`-a` (`identifier`).
pub(crate) fn command_definition() -> Command {
    // Positional argument: the file whose contents are scanned.
    let file = Arg::new("file")
        .help("Path to the file to extract words from.")
        .value_parser(value_parser!(PathBuf))
        .required(true);

    // Words shorter than this are dropped.
    let min_length = Arg::new("min-length")
        .short('m')
        .long("min-length")
        .value_parser(value_parser!(usize))
        .default_value("3")
        .help("Minimum length to consider a word.");

    // Selects which character classes make up a word.
    let algorithm = Arg::new("algorithm")
        .short('a')
        .long("algorithm")
        .value_parser(value_parser!(Algorithm))
        .default_value("identifier")
        .help("The algorithm to determine matching words");

    Command::new("extract-words")
        .about(
            "Extract unique alphanumeric sequences that match common identifier rules from the given file. \
            Only ASCII is supported.",
        )
        .arg(file)
        .arg(min_length)
        .arg(algorithm)
}
// Selects which characters may start and continue a word, and which word
// lengths are accepted. Parsed from the `--algorithm` option; the accepted
// values are the snake_case forms of the variant names.
//
// NOTE: plain `//` comments are used here on purpose. Doc comments on a
// `ValueEnum` are picked up by clap's derive as help text, which would change
// the program's `--help` output.
#[derive(Copy, Clone, Debug, ValueEnum)]
#[value(rename_all = "snake_case")]
enum Algorithm {
// Runs of ASCII letters.
Alphabetic,
// Runs of ASCII letters and digits.
Alphanumeric,
// An ASCII letter followed by ASCII letters and digits.
Identifier,
// Runs of numeric characters.
Number,
// Runs of hexadecimal digits that are exactly 8 characters long.
Hash32,
// Runs of hexadecimal digits that are exactly 16 characters long.
Hash64,
}
impl Algorithm {
    /// Returns `true` if `c` may be the first character of a word.
    ///
    /// Every class is restricted to ASCII, matching the command's documented
    /// "Only ASCII is supported" contract.
    fn is_start(&self, c: char) -> bool {
        match self {
            // Identifiers must not begin with a digit.
            Self::Alphabetic | Self::Identifier => c.is_ascii_alphabetic(),
            Self::Alphanumeric => c.is_ascii_alphanumeric(),
            // `is_ascii_digit` rather than `is_numeric`: the latter also
            // accepts non-ASCII numerics such as '½' or '٣'.
            Self::Number => c.is_ascii_digit(),
            Self::Hash32 | Self::Hash64 => c.is_ascii_hexdigit(),
        }
    }

    /// Returns `true` if `c` may appear after the first character of a word.
    fn is_body(&self, c: char) -> bool {
        match self {
            Self::Alphabetic => c.is_ascii_alphabetic(),
            Self::Alphanumeric | Self::Identifier => c.is_ascii_alphanumeric(),
            // ASCII only, see `is_start`.
            Self::Number => c.is_ascii_digit(),
            Self::Hash32 | Self::Hash64 => c.is_ascii_hexdigit(),
        }
    }

    /// Returns `true` if a completed word of `len` characters is acceptable.
    ///
    /// Only the hash algorithms constrain the length: a 32-bit hash is 8 hex
    /// digits, a 64-bit hash is 16. This check is independent of the
    /// user-supplied minimum length.
    fn is_length(&self, len: usize) -> bool {
        match self {
            Self::Hash32 => len == 8,
            Self::Hash64 => len == 16,
            Self::Alphabetic | Self::Alphanumeric | Self::Identifier | Self::Number => true,
        }
    }
}
impl std::fmt::Display for Algorithm {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{}",
match self {
Algorithm::Alphabetic => "alphabetic",
Algorithm::Alphanumeric => "alphanumeric",
Algorithm::Identifier => "identifier",
Algorithm::Number => "number",
Algorithm::Hash32 => "hash32",
Algorithm::Hash64 => "hash64",
}
)
}
}
/// States of the character-by-character scanner in `run`.
#[derive(Copy, Clone, Debug)]
enum State {
/// Initial state, before any character has been read.
Begin,
/// The scanner is between words.
NonWord,
/// The scanner is inside a word and is accumulating its characters.
Word,
/// The input is exhausted; the scanner loop terminates.
End,
}
/// Runs the `extract-words` subcommand.
///
/// Reads the file given by the `file` argument as UTF-8 text, scans it one
/// character at a time and prints every distinct word accepted by the chosen
/// algorithm to stdout, one per line, in order of first appearance.
///
/// A word starts at a character for which `Algorithm::is_start` holds and
/// extends over all following characters for which `Algorithm::is_body`
/// holds. It is printed only if it is at least `min-length` long and passes
/// `Algorithm::is_length`.
///
/// # Errors
///
/// Returns an error if the file cannot be read as a UTF-8 string.
///
/// # Panics
///
/// Panics if `matches` lacks one of the arguments declared by
/// `command_definition`, which would be a programming error.
#[tracing::instrument(skip_all)]
pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
    let path = matches
        .get_one::<PathBuf>("file")
        .expect("missing required parameter");

    let algorithm = matches
        .get_one::<Algorithm>("algorithm")
        .copied()
        .expect("parameter has default");

    let min_length = matches
        .get_one::<usize>("min-length")
        .copied()
        .expect("parameter has default");

    let content = fs::read_to_string(path)
        .await
        .wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;

    let mut chars = content.chars();
    let mut state = State::Begin;
    // Characters of the word currently being accumulated.
    let mut word = String::new();
    // Words already printed, used to suppress duplicates.
    let mut visited = HashSet::new();

    'machine: loop {
        state = match state {
            State::End => break 'machine,

            // Outside of a word. A new word may only begin with a *start*
            // character; using the body rule here would, e.g., let an
            // identifier begin with a digit.
            State::Begin | State::NonWord => match chars.next() {
                None => State::End,
                Some(c) if algorithm.is_start(c) => {
                    word.push(c);
                    State::Word
                }
                Some(_) => State::NonWord,
            },

            State::Word => match chars.next() {
                Some(c) if algorithm.is_body(c) => {
                    word.push(c);
                    State::Word
                }
                // The word ended, either at a non-body character or at the
                // end of the input.
                next => {
                    emit_word(&word, algorithm, min_length, &mut visited);
                    word.clear();

                    if next.is_some() {
                        State::NonWord
                    } else {
                        State::End
                    }
                }
            },
        };
    }

    Ok(())
}

/// Prints `word` and records it in `visited`, unless it is too short, has a
/// length the algorithm rejects, or was already printed.
fn emit_word(
    word: &str,
    algorithm: Algorithm,
    min_length: usize,
    visited: &mut HashSet<String>,
) {
    let len = word.len();
    if len < min_length || !algorithm.is_length(len) || visited.contains(word) {
        return;
    }

    println!("{}", word);
    visited.insert(word.to_owned());
}

View file

@ -1,15 +1,19 @@
use clap::{ArgMatches, Command}; use clap::{ArgMatches, Command};
use color_eyre::Result; use color_eyre::Result;
mod extract_words;
pub(crate) fn command_definition() -> Command { pub(crate) fn command_definition() -> Command {
Command::new("experiment") Command::new("experiment")
.subcommand_required(true) .subcommand_required(true)
.about("A collection of utilities and experiments.") .about("A collection of utilities and experiments.")
.subcommand(extract_words::command_definition())
} }
#[tracing::instrument(skip_all)] #[tracing::instrument(skip_all)]
pub(crate) async fn run(ctx: sdk::Context, matches: &ArgMatches) -> Result<()> { pub(crate) async fn run(ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
match matches.subcommand() { match matches.subcommand() {
Some(("extract-words", sub_matches)) => extract_words::run(ctx, sub_matches).await,
_ => unreachable!( _ => unreachable!(
"clap is configured to require a subcommand, and they're all handled above" "clap is configured to require a subcommand, and they're all handled above"
), ),