From 6485dae27bc152a6eef7533d548e2c8f871b6563 Mon Sep 17 00:00:00 2001 From: Lucas Schwiderski Date: Sat, 16 Sep 2023 19:03:04 +0200 Subject: [PATCH] experiment: Add command to create word permutations This creates candidate values to brute force dictionary entries with, by building combinations from a word list and delimiters. --- Cargo.lock | 35 ++- crates/dtmt/Cargo.toml | 2 + .../src/cmd/experiment/brute_force_words.rs | 239 ++++++++++++++++++ crates/dtmt/src/cmd/experiment/mod.rs | 3 + 4 files changed, 277 insertions(+), 2 deletions(-) create mode 100644 crates/dtmt/src/cmd/experiment/brute_force_words.rs diff --git a/Cargo.lock b/Cargo.lock index a251de9..dac07e9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -161,6 +161,17 @@ dependencies = [ "system-deps", ] +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi 0.1.19", + "libc", + "winapi", +] + [[package]] name = "autocfg" version = "1.3.0" @@ -212,7 +223,7 @@ dependencies = [ "bitflags 2.5.0", "cexpr", "clang-sys", - "itertools", + "itertools 0.12.1", "lazy_static", "lazycell", "log", @@ -927,6 +938,7 @@ name = "dtmt" version = "0.3.0" dependencies = [ "async-recursion", + "atty", "clap", "cli-table", "color-eyre", @@ -936,6 +948,7 @@ dependencies = [ "futures", "futures-util", "glob", + "itertools 0.11.0", "luajit2-sys", "nanorand", "notify", @@ -1598,6 +1611,15 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + [[package]] name = "hermit-abi" version = "0.3.9" @@ -1858,6 +1880,15 @@ version = "1.70.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.12.1" @@ -2267,7 +2298,7 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" dependencies = [ - "hermit-abi", + "hermit-abi 0.3.9", "libc", ] diff --git a/crates/dtmt/Cargo.toml b/crates/dtmt/Cargo.toml index d836a50..8018a8b 100644 --- a/crates/dtmt/Cargo.toml +++ b/crates/dtmt/Cargo.toml @@ -33,6 +33,8 @@ async-recursion = "1.0.2" notify = "6.1.1" luajit2-sys = { path = "../../lib/luajit2-sys", version = "*" } shlex = { version = "1.2.0", optional = true } +atty = "0.2.14" +itertools = "0.11.0" [dev-dependencies] tempfile = "3.3.0" diff --git a/crates/dtmt/src/cmd/experiment/brute_force_words.rs b/crates/dtmt/src/cmd/experiment/brute_force_words.rs new file mode 100644 index 0000000..6bf81bb --- /dev/null +++ b/crates/dtmt/src/cmd/experiment/brute_force_words.rs @@ -0,0 +1,239 @@ +use std::path::PathBuf; + +use clap::{value_parser, Arg, ArgAction, ArgMatches, Command}; +use color_eyre::eyre::{self, Context}; +use color_eyre::Result; +use itertools::Itertools; +use tokio::fs; + +pub(crate) fn command_definition() -> Command { + Command::new("brute-force-words") + .about( + "Given a list of words and a set of delimiters, iteratevily creates permutations \ + of growing length.\n\ + Delimiters are placed between every word in the result.\n\n\ + Example: \ + Given the words ['packages', 'boot'], the delimiters ['/', '_'] and a length of 2, the resulting \ + words will be\n\ + - packages\n\ + - boot\n\ + - packages/packages\n\ + - packages_packages\n\ + - packages/boot\n\ + - packages_boot\n\ + - boot/packages\n\ + - boot_packages\n\ + - boot/boot\n\ + - boot_boot", + ) + .arg( + Arg::new("delimiter") + .help( + "The delimiters to put between the words. \ + All permutations of this list will be tried for every string of words.\n\ + Specify multiple times to set multiple values.\n\ + Defaults to ['/', '_'].", + ) + .short('d') + .long("delimiter") + .action(ArgAction::Append), + ) + .arg( + Arg::new("max-length") + .help("The maximum number of words up to which to build strings.") + .long("max") + .long("max-length") + .short('m') + .default_value("5") + .value_parser(value_parser!(usize)), + ) + .arg( + Arg::new("continue") + .help("Can be used to continue a previous operation where it stopped. Word list and delimiters must match.") + .short('c') + .long("continue") + ) + .arg( + Arg::new("words") + .help("Path to a file containing words line by line.") + .required(true) + .value_parser(value_parser!(PathBuf)), + ) +} + +#[tracing::instrument(skip_all)] +#[allow(clippy::mut_range_bound)] +pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> { + let max_length: usize = matches + .get_one::("max-length") + .copied() + .expect("parameter has default"); + + let words: Vec = { + let path = matches + .get_one::("words") + .expect("missing required parameter"); + + let file = fs::read_to_string(&path) + .await + .wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?; + + file.lines().map(str::to_string).collect() + }; + + if words.is_empty() { + eyre::bail!("Word list must not be empty"); + } + + let mut delimiters: Vec = matches + .get_many::("delimiter") + .unwrap_or_default() + .cloned() + .collect(); + + if delimiters.is_empty() { + delimiters.push(String::from("/")); + delimiters.push(String::from("_")); + } + + let delimiters_len = delimiters.len(); + + let word_count = words.len(); + tracing::info!("{} words to try", word_count); + + // To be able to easily combine the permutations of words and delimiters, + // we turn the latter into a pre-defined list of all permutations of delimiters + // that are possible at the given amount of words. + // Combining `Iterator::cycle` with `Itertools::permutations` works, but + // with a high `max_length`, it runs OOM. + // So we basically have to implement a smaller version of the iterative algorithm we use later on + // to build permutations of the actual words. + let delimiter_lists = { + let mut indices = vec![0; max_length - 1]; + let mut list = Vec::new(); + + for _ in 0..delimiters_len.pow(max_length as u32 - 1) { + list.push(indices.iter().map(|i| &delimiters[*i]).collect::>()); + + for v in indices.iter_mut() { + if *v >= delimiters_len - 1 { + *v = 0; + break; + } else { + *v += 1; + } + } + } + + list + }; + + tracing::debug!("{:?}", delimiter_lists); + + let mut count = 0u64; + + let mut indices = if let Some(cont) = matches.get_one::("continue").cloned() { + let mut splits = vec![cont.clone()]; + + for delim in delimiters.iter() { + splits = splits + .iter() + .flat_map(|s| s.split(delim)) + .map(|s| s.to_string()) + .collect(); + } + + let indices = splits + .into_iter() + .map(|s| { + words + .iter() + .enumerate() + .find(|(_, v)| s == **v) + .map(|(i, _)| i) + .ok_or_else(|| eyre::eyre!("'{}' is not in the word list", s)) + }) + .collect::>()?; + + tracing::info!("Continuing from '{}' -> '{:?}'", cont, &indices); + + indices + } else { + vec![0] + }; + let mut indices_len = indices.len(); + let mut sequence = indices + .iter() + .map(|index| words[*index].as_str()) + .collect::>(); + + // Prevent re-allocation by reserving as much as we need upfront + indices.reserve(max_length); + sequence.reserve(max_length); + + 'outer: loop { + // We only want delimiters between words, so we keep that iterator shorter by + // one. + let delimiter_count = sequence.len() as u32 - 1; + + tracing::trace!( + "{} | {:?} -> {:?}", + delimiters_len.pow(delimiter_count), + indices, + sequence + ); + + for delims in delimiter_lists + .iter() + .take(delimiters_len.pow(delimiter_count)) + { + let delims = delims + .iter() + .map(|s| s.as_str()) + .take(delimiter_count as usize); + let s: String = sequence + .iter() + .copied() + .interleave(delims) + .flat_map(|word| word.chars()) + .collect(); + + count = count.wrapping_add(1); + + if count % 500000 == 0 { + tracing::info!("{} words generated", count); + } + + println!("{}", s); + } + + for i in 0..indices_len { + let index = indices.get_mut(i).unwrap(); + let word = sequence.get_mut(i).unwrap(); + + if *index >= word_count - 1 { + *index = 0; + *word = words[*index].as_str(); + + if indices.get(i + 1).is_none() { + indices.push(0); + sequence.push(words[0].as_str()); + + indices_len += 1; + + if indices_len > max_length { + break 'outer; + } + + break; + } + } else { + *index += 1; + *word = words[*index].as_str(); + break; + } + } + } + + Ok(()) +} diff --git a/crates/dtmt/src/cmd/experiment/mod.rs b/crates/dtmt/src/cmd/experiment/mod.rs index 51e5fc7..9ceb3b9 100644 --- a/crates/dtmt/src/cmd/experiment/mod.rs +++ b/crates/dtmt/src/cmd/experiment/mod.rs @@ -1,18 +1,21 @@ use clap::{ArgMatches, Command}; use color_eyre::Result; +mod brute_force_words; mod extract_words; pub(crate) fn command_definition() -> Command { Command::new("experiment") .subcommand_required(true) .about("A collection of utilities and experiments.") + .subcommand(brute_force_words::command_definition()) .subcommand(extract_words::command_definition()) } #[tracing::instrument(skip_all)] pub(crate) async fn run(ctx: sdk::Context, matches: &ArgMatches) -> Result<()> { match matches.subcommand() { + Some(("brute-force-words", sub_matches)) => brute_force_words::run(ctx, sub_matches).await, Some(("extract-words", sub_matches)) => extract_words::run(ctx, sub_matches).await, _ => unreachable!( "clap is configured to require a subcommand, and they're all handled above"