experiment: Add command to create word permutations

This creates candidate values to brute force dictionary entries with,
by building combinations from a word list and delimiters.
This commit is contained in:
Lucas Schwiderski 2023-09-16 19:03:04 +02:00
parent 94347d57f9
commit 6485dae27b
Signed by: lucas
GPG key ID: AA12679AAA6DF4D8
4 changed files with 277 additions and 2 deletions

35
Cargo.lock generated
View file

@ -161,6 +161,17 @@ dependencies = [
"system-deps",
]
[[package]]
name = "atty"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
dependencies = [
"hermit-abi 0.1.19",
"libc",
"winapi",
]
[[package]]
name = "autocfg"
version = "1.3.0"
@ -212,7 +223,7 @@ dependencies = [
"bitflags 2.5.0",
"cexpr",
"clang-sys",
"itertools",
"itertools 0.12.1",
"lazy_static",
"lazycell",
"log",
@ -927,6 +938,7 @@ name = "dtmt"
version = "0.3.0"
dependencies = [
"async-recursion",
"atty",
"clap",
"cli-table",
"color-eyre",
@ -936,6 +948,7 @@ dependencies = [
"futures",
"futures-util",
"glob",
"itertools 0.11.0",
"luajit2-sys",
"nanorand",
"notify",
@ -1598,6 +1611,15 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "hermit-abi"
version = "0.1.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
dependencies = [
"libc",
]
[[package]]
name = "hermit-abi"
version = "0.3.9"
@ -1858,6 +1880,15 @@ version = "1.70.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800"
[[package]]
name = "itertools"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57"
dependencies = [
"either",
]
[[package]]
name = "itertools"
version = "0.12.1"
@ -2267,7 +2298,7 @@ version = "1.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
dependencies = [
"hermit-abi",
"hermit-abi 0.3.9",
"libc",
]

View file

@ -33,6 +33,8 @@ async-recursion = "1.0.2"
notify = "6.1.1"
luajit2-sys = { path = "../../lib/luajit2-sys", version = "*" }
shlex = { version = "1.2.0", optional = true }
atty = "0.2.14"
itertools = "0.11.0"
[dev-dependencies]
tempfile = "3.3.0"

View file

@ -0,0 +1,239 @@
use std::path::PathBuf;
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
use color_eyre::eyre::{self, Context};
use color_eyre::Result;
use itertools::Itertools;
use tokio::fs;
/// Builds the clap definition of the `brute-force-words` subcommand.
///
/// Arguments:
/// - `-d`/`--delimiter` (repeatable): delimiters to place between words.
/// - `-m`/`--max-length` (hidden alias `--max`): maximum number of words per generated
///   string, parsed as `usize`, default `5`.
/// - `-c`/`--continue`: a previously generated value to resume from.
/// - `words` (required, positional): path to the word list file.
pub(crate) fn command_definition() -> Command {
    Command::new("brute-force-words")
        .about(
            "Given a list of words and a set of delimiters, iteratively creates permutations \
            of growing length.\n\
            Delimiters are placed between every word in the result.\n\n\
            Example: \
            Given the words ['packages', 'boot'], the delimiters ['/', '_'] and a length of 2, the resulting \
            words will be\n\
            - packages\n\
            - boot\n\
            - packages/packages\n\
            - packages_packages\n\
            - packages/boot\n\
            - packages_boot\n\
            - boot/packages\n\
            - boot_packages\n\
            - boot/boot\n\
            - boot_boot",
        )
        .arg(
            Arg::new("delimiter")
                .help(
                    "The delimiters to put between the words. \
                    All permutations of this list will be tried for every string of words.\n\
                    Specify multiple times to set multiple values.\n\
                    Defaults to ['/', '_'].",
                )
                .short('d')
                .long("delimiter")
                .action(ArgAction::Append),
        )
        .arg(
            Arg::new("max-length")
                .help("The maximum number of words up to which to build strings.")
                // `Arg::long` replaces any previously set long name, so the shorter
                // spelling has to be registered as an alias to be usable at all.
                .long("max-length")
                .alias("max")
                .short('m')
                .default_value("5")
                .value_parser(value_parser!(usize)),
        )
        .arg(
            Arg::new("continue")
                .help("Can be used to continue a previous operation where it stopped. Word list and delimiters must match.")
                .short('c')
                .long("continue"),
        )
        .arg(
            Arg::new("words")
                .help("Path to a file containing words line by line.")
                .required(true)
                .value_parser(value_parser!(PathBuf)),
        )
}
#[tracing::instrument(skip_all)]
#[allow(clippy::mut_range_bound)]
pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
let max_length: usize = matches
.get_one::<usize>("max-length")
.copied()
.expect("parameter has default");
let words: Vec<String> = {
let path = matches
.get_one::<PathBuf>("words")
.expect("missing required parameter");
let file = fs::read_to_string(&path)
.await
.wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
file.lines().map(str::to_string).collect()
};
if words.is_empty() {
eyre::bail!("Word list must not be empty");
}
let mut delimiters: Vec<String> = matches
.get_many::<String>("delimiter")
.unwrap_or_default()
.cloned()
.collect();
if delimiters.is_empty() {
delimiters.push(String::from("/"));
delimiters.push(String::from("_"));
}
let delimiters_len = delimiters.len();
let word_count = words.len();
tracing::info!("{} words to try", word_count);
// To be able to easily combine the permutations of words and delimiters,
// we turn the latter into a pre-defined list of all permutations of delimiters
// that are possible at the given amount of words.
// Combining `Iterator::cycle` with `Itertools::permutations` works, but
// with a high `max_length`, it runs OOM.
// So we basically have to implement a smaller version of the iterative algorithm we use later on
// to build permutations of the actual words.
let delimiter_lists = {
let mut indices = vec![0; max_length - 1];
let mut list = Vec::new();
for _ in 0..delimiters_len.pow(max_length as u32 - 1) {
list.push(indices.iter().map(|i| &delimiters[*i]).collect::<Vec<_>>());
for v in indices.iter_mut() {
if *v >= delimiters_len - 1 {
*v = 0;
break;
} else {
*v += 1;
}
}
}
list
};
tracing::debug!("{:?}", delimiter_lists);
let mut count = 0u64;
let mut indices = if let Some(cont) = matches.get_one::<String>("continue").cloned() {
let mut splits = vec![cont.clone()];
for delim in delimiters.iter() {
splits = splits
.iter()
.flat_map(|s| s.split(delim))
.map(|s| s.to_string())
.collect();
}
let indices = splits
.into_iter()
.map(|s| {
words
.iter()
.enumerate()
.find(|(_, v)| s == **v)
.map(|(i, _)| i)
.ok_or_else(|| eyre::eyre!("'{}' is not in the word list", s))
})
.collect::<Result<_>>()?;
tracing::info!("Continuing from '{}' -> '{:?}'", cont, &indices);
indices
} else {
vec![0]
};
let mut indices_len = indices.len();
let mut sequence = indices
.iter()
.map(|index| words[*index].as_str())
.collect::<Vec<_>>();
// Prevent re-allocation by reserving as much as we need upfront
indices.reserve(max_length);
sequence.reserve(max_length);
'outer: loop {
// We only want delimiters between words, so we keep that iterator shorter by
// one.
let delimiter_count = sequence.len() as u32 - 1;
tracing::trace!(
"{} | {:?} -> {:?}",
delimiters_len.pow(delimiter_count),
indices,
sequence
);
for delims in delimiter_lists
.iter()
.take(delimiters_len.pow(delimiter_count))
{
let delims = delims
.iter()
.map(|s| s.as_str())
.take(delimiter_count as usize);
let s: String = sequence
.iter()
.copied()
.interleave(delims)
.flat_map(|word| word.chars())
.collect();
count = count.wrapping_add(1);
if count % 500000 == 0 {
tracing::info!("{} words generated", count);
}
println!("{}", s);
}
for i in 0..indices_len {
let index = indices.get_mut(i).unwrap();
let word = sequence.get_mut(i).unwrap();
if *index >= word_count - 1 {
*index = 0;
*word = words[*index].as_str();
if indices.get(i + 1).is_none() {
indices.push(0);
sequence.push(words[0].as_str());
indices_len += 1;
if indices_len > max_length {
break 'outer;
}
break;
}
} else {
*index += 1;
*word = words[*index].as_str();
break;
}
}
}
Ok(())
}

View file

@ -1,18 +1,21 @@
use clap::{ArgMatches, Command};
use color_eyre::Result;
mod brute_force_words;
mod extract_words;
/// Assembles the `experiment` command, a container for the experimental utilities.
///
/// A subcommand is mandatory. The commands defined by the `brute_force_words` and
/// `extract_words` modules are registered, in that order.
pub(crate) fn command_definition() -> Command {
    let brute_force_cmd = brute_force_words::command_definition();
    let extract_cmd = extract_words::command_definition();

    let experiment = Command::new("experiment")
        .subcommand_required(true)
        .about("A collection of utilities and experiments.");

    experiment
        .subcommand(brute_force_cmd)
        .subcommand(extract_cmd)
}
#[tracing::instrument(skip_all)]
pub(crate) async fn run(ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
match matches.subcommand() {
Some(("brute-force-words", sub_matches)) => brute_force_words::run(ctx, sub_matches).await,
Some(("extract-words", sub_matches)) => extract_words::run(ctx, sub_matches).await,
_ => unreachable!(
"clap is configured to require a subcommand, and they're all handled above"