dtmt: Add command to extract words from file
As part of trying to brute force values for the dictionary, this allows extracting candidate words from a file.
This commit is contained in:
parent
2daff544a5
commit
94347d57f9
2 changed files with 186 additions and 0 deletions
182
crates/dtmt/src/cmd/experiment/extract_words.rs
Normal file
182
crates/dtmt/src/cmd/experiment/extract_words.rs
Normal file
|
@ -0,0 +1,182 @@
|
||||||
|
use std::collections::HashSet;
|
||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
use clap::{value_parser, Arg, ArgMatches, Command, ValueEnum};
|
||||||
|
use color_eyre::eyre::Context;
|
||||||
|
use color_eyre::Result;
|
||||||
|
use tokio::fs;
|
||||||
|
|
||||||
|
pub(crate) fn command_definition() -> Command {
|
||||||
|
Command::new("extract-words")
|
||||||
|
.about(
|
||||||
|
"Extract unique alphanumeric sequences that match common identifier rules from the given file. \
|
||||||
|
Only ASCII is supported.",
|
||||||
|
)
|
||||||
|
.arg(
|
||||||
|
Arg::new("file")
|
||||||
|
.required(true)
|
||||||
|
.value_parser(value_parser!(PathBuf))
|
||||||
|
.help("Path to the file to extract words from."),
|
||||||
|
)
|
||||||
|
.arg(
|
||||||
|
Arg::new("min-length")
|
||||||
|
.help("Minimum length to consider a word.")
|
||||||
|
.long("min-length")
|
||||||
|
.short('m')
|
||||||
|
.default_value("3")
|
||||||
|
.value_parser(value_parser!(usize))
|
||||||
|
)
|
||||||
|
.arg(
|
||||||
|
Arg::new("algorithm")
|
||||||
|
.help("The algorithm to determine matching words")
|
||||||
|
.long("algorithm")
|
||||||
|
.short('a')
|
||||||
|
.default_value("identifier")
|
||||||
|
.value_parser(value_parser!(Algorithm))
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Copy, Clone, Debug, ValueEnum)]
|
||||||
|
#[value(rename_all = "snake_case")]
|
||||||
|
enum Algorithm {
|
||||||
|
Alphabetic,
|
||||||
|
Alphanumeric,
|
||||||
|
Identifier,
|
||||||
|
Number,
|
||||||
|
Hash32,
|
||||||
|
Hash64,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Algorithm {
|
||||||
|
fn is_start(&self, c: char) -> bool {
|
||||||
|
match self {
|
||||||
|
Self::Alphabetic => c.is_ascii_alphabetic(),
|
||||||
|
Self::Alphanumeric => c.is_ascii_alphanumeric(),
|
||||||
|
Self::Identifier => c.is_ascii_alphabetic(),
|
||||||
|
Self::Number => c.is_numeric(),
|
||||||
|
Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_body(&self, c: char) -> bool {
|
||||||
|
match self {
|
||||||
|
Self::Alphabetic => c.is_ascii_alphabetic(),
|
||||||
|
Self::Alphanumeric => c.is_ascii_alphanumeric(),
|
||||||
|
Self::Identifier => c.is_ascii_alphanumeric(),
|
||||||
|
Self::Number => c.is_numeric(),
|
||||||
|
Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_length(&self, len: usize) -> bool {
|
||||||
|
match self {
|
||||||
|
Self::Alphabetic => true,
|
||||||
|
Self::Alphanumeric => true,
|
||||||
|
Self::Identifier => true,
|
||||||
|
Self::Number => true,
|
||||||
|
Self::Hash32 => len == 8,
|
||||||
|
Self::Hash64 => len == 16,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::fmt::Display for Algorithm {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
write!(
|
||||||
|
f,
|
||||||
|
"{}",
|
||||||
|
match self {
|
||||||
|
Algorithm::Alphabetic => "alphabetic",
|
||||||
|
Algorithm::Alphanumeric => "alphanumeric",
|
||||||
|
Algorithm::Identifier => "identifier",
|
||||||
|
Algorithm::Number => "number",
|
||||||
|
Algorithm::Hash32 => "hash32",
|
||||||
|
Algorithm::Hash64 => "hash64",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// States of the character-by-character scanner driven by `run`.
#[derive(Clone, Copy, Debug)]
enum State {
    /// Initial state, before any character has been consumed.
    Begin,
    /// The previously consumed character was not part of a word.
    NonWord,
    /// Characters are currently being accumulated into a word.
    Word,
    /// The input is exhausted; the scanner stops.
    End,
}
|
||||||
|
|
||||||
|
#[tracing::instrument(skip_all)]
|
||||||
|
pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
|
||||||
|
let path = matches
|
||||||
|
.get_one::<PathBuf>("file")
|
||||||
|
.expect("missing required parameter");
|
||||||
|
|
||||||
|
let algorithm = matches
|
||||||
|
.get_one::<Algorithm>("algorithm")
|
||||||
|
.expect("parameter has default");
|
||||||
|
|
||||||
|
let min_length = matches
|
||||||
|
.get_one::<usize>("min-length")
|
||||||
|
.copied()
|
||||||
|
.expect("paramter has default");
|
||||||
|
|
||||||
|
let content = fs::read_to_string(&path)
|
||||||
|
.await
|
||||||
|
.wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
|
||||||
|
let mut chars = content.chars();
|
||||||
|
|
||||||
|
let mut state = State::Begin;
|
||||||
|
let mut word = String::new();
|
||||||
|
let mut visited = HashSet::new();
|
||||||
|
|
||||||
|
'machine: loop {
|
||||||
|
state = match state {
|
||||||
|
State::Begin => match chars.next() {
|
||||||
|
None => State::End,
|
||||||
|
Some(c) if algorithm.is_start(c) => {
|
||||||
|
word.push(c);
|
||||||
|
State::Word
|
||||||
|
}
|
||||||
|
Some(_) => State::NonWord,
|
||||||
|
},
|
||||||
|
State::End => break 'machine,
|
||||||
|
State::NonWord => match chars.next() {
|
||||||
|
None => State::End,
|
||||||
|
Some(c) if algorithm.is_body(c) => {
|
||||||
|
word.push(c);
|
||||||
|
State::Word
|
||||||
|
}
|
||||||
|
Some(_) => State::NonWord,
|
||||||
|
},
|
||||||
|
State::Word => match chars.next() {
|
||||||
|
None => {
|
||||||
|
if word.len() >= min_length
|
||||||
|
&& algorithm.is_length(word.len())
|
||||||
|
&& !visited.contains(&word)
|
||||||
|
{
|
||||||
|
println!("{}", &word);
|
||||||
|
visited.insert(word.clone());
|
||||||
|
}
|
||||||
|
State::End
|
||||||
|
}
|
||||||
|
Some(c) if algorithm.is_body(c) => {
|
||||||
|
word.push(c);
|
||||||
|
State::Word
|
||||||
|
}
|
||||||
|
Some(_) => {
|
||||||
|
if word.len() >= min_length
|
||||||
|
&& algorithm.is_length(word.len())
|
||||||
|
&& !visited.contains(&word)
|
||||||
|
{
|
||||||
|
println!("{}", &word);
|
||||||
|
visited.insert(word.clone());
|
||||||
|
}
|
||||||
|
word.clear();
|
||||||
|
State::NonWord
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
|
@ -1,15 +1,19 @@
|
||||||
use clap::{ArgMatches, Command};
|
use clap::{ArgMatches, Command};
|
||||||
use color_eyre::Result;
|
use color_eyre::Result;
|
||||||
|
|
||||||
|
mod extract_words;
|
||||||
|
|
||||||
pub(crate) fn command_definition() -> Command {
|
pub(crate) fn command_definition() -> Command {
|
||||||
Command::new("experiment")
|
Command::new("experiment")
|
||||||
.subcommand_required(true)
|
.subcommand_required(true)
|
||||||
.about("A collection of utilities and experiments.")
|
.about("A collection of utilities and experiments.")
|
||||||
|
.subcommand(extract_words::command_definition())
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tracing::instrument(skip_all)]
|
#[tracing::instrument(skip_all)]
|
||||||
pub(crate) async fn run(ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
|
pub(crate) async fn run(ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
|
||||||
match matches.subcommand() {
|
match matches.subcommand() {
|
||||||
|
Some(("extract-words", sub_matches)) => extract_words::run(ctx, sub_matches).await,
|
||||||
_ => unreachable!(
|
_ => unreachable!(
|
||||||
"clap is configured to require a subcommand, and they're all handled above"
|
"clap is configured to require a subcommand, and they're all handled above"
|
||||||
),
|
),
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue