experiment: Add command to create word permutations
This creates candidate values to brute force dictionary entries with, by building combinations from a word list and delimiters.
This commit is contained in:
parent
94347d57f9
commit
6485dae27b
4 changed files with 277 additions and 2 deletions
35
Cargo.lock
generated
35
Cargo.lock
generated
|
@ -161,6 +161,17 @@ dependencies = [
|
|||
"system-deps",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "atty"
|
||||
version = "0.2.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
|
||||
dependencies = [
|
||||
"hermit-abi 0.1.19",
|
||||
"libc",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "1.3.0"
|
||||
|
@ -212,7 +223,7 @@ dependencies = [
|
|||
"bitflags 2.5.0",
|
||||
"cexpr",
|
||||
"clang-sys",
|
||||
"itertools",
|
||||
"itertools 0.12.1",
|
||||
"lazy_static",
|
||||
"lazycell",
|
||||
"log",
|
||||
|
@ -927,6 +938,7 @@ name = "dtmt"
|
|||
version = "0.3.0"
|
||||
dependencies = [
|
||||
"async-recursion",
|
||||
"atty",
|
||||
"clap",
|
||||
"cli-table",
|
||||
"color-eyre",
|
||||
|
@ -936,6 +948,7 @@ dependencies = [
|
|||
"futures",
|
||||
"futures-util",
|
||||
"glob",
|
||||
"itertools 0.11.0",
|
||||
"luajit2-sys",
|
||||
"nanorand",
|
||||
"notify",
|
||||
|
@ -1598,6 +1611,15 @@ version = "0.5.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
|
||||
|
||||
[[package]]
|
||||
name = "hermit-abi"
|
||||
version = "0.1.19"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hermit-abi"
|
||||
version = "0.3.9"
|
||||
|
@ -1858,6 +1880,15 @@ version = "1.70.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800"
|
||||
|
||||
[[package]]
|
||||
name = "itertools"
|
||||
version = "0.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57"
|
||||
dependencies = [
|
||||
"either",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itertools"
|
||||
version = "0.12.1"
|
||||
|
@ -2267,7 +2298,7 @@ version = "1.16.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
|
||||
dependencies = [
|
||||
"hermit-abi",
|
||||
"hermit-abi 0.3.9",
|
||||
"libc",
|
||||
]
|
||||
|
||||
|
|
|
@ -33,6 +33,8 @@ async-recursion = "1.0.2"
|
|||
notify = "6.1.1"
|
||||
luajit2-sys = { path = "../../lib/luajit2-sys", version = "*" }
|
||||
shlex = { version = "1.2.0", optional = true }
|
||||
atty = "0.2.14"
|
||||
itertools = "0.11.0"
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3.3.0"
|
||||
|
|
239
crates/dtmt/src/cmd/experiment/brute_force_words.rs
Normal file
239
crates/dtmt/src/cmd/experiment/brute_force_words.rs
Normal file
|
@ -0,0 +1,239 @@
|
|||
use std::path::PathBuf;
|
||||
|
||||
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
|
||||
use color_eyre::eyre::{self, Context};
|
||||
use color_eyre::Result;
|
||||
use itertools::Itertools;
|
||||
use tokio::fs;
|
||||
|
||||
pub(crate) fn command_definition() -> Command {
|
||||
Command::new("brute-force-words")
|
||||
.about(
|
||||
"Given a list of words and a set of delimiters, iteratevily creates permutations \
|
||||
of growing length.\n\
|
||||
Delimiters are placed between every word in the result.\n\n\
|
||||
Example: \
|
||||
Given the words ['packages', 'boot'], the delimiters ['/', '_'] and a length of 2, the resulting \
|
||||
words will be\n\
|
||||
- packages\n\
|
||||
- boot\n\
|
||||
- packages/packages\n\
|
||||
- packages_packages\n\
|
||||
- packages/boot\n\
|
||||
- packages_boot\n\
|
||||
- boot/packages\n\
|
||||
- boot_packages\n\
|
||||
- boot/boot\n\
|
||||
- boot_boot",
|
||||
)
|
||||
.arg(
|
||||
Arg::new("delimiter")
|
||||
.help(
|
||||
"The delimiters to put between the words. \
|
||||
All permutations of this list will be tried for every string of words.\n\
|
||||
Specify multiple times to set multiple values.\n\
|
||||
Defaults to ['/', '_'].",
|
||||
)
|
||||
.short('d')
|
||||
.long("delimiter")
|
||||
.action(ArgAction::Append),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("max-length")
|
||||
.help("The maximum number of words up to which to build strings.")
|
||||
.long("max")
|
||||
.long("max-length")
|
||||
.short('m')
|
||||
.default_value("5")
|
||||
.value_parser(value_parser!(usize)),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("continue")
|
||||
.help("Can be used to continue a previous operation where it stopped. Word list and delimiters must match.")
|
||||
.short('c')
|
||||
.long("continue")
|
||||
)
|
||||
.arg(
|
||||
Arg::new("words")
|
||||
.help("Path to a file containing words line by line.")
|
||||
.required(true)
|
||||
.value_parser(value_parser!(PathBuf)),
|
||||
)
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all)]
|
||||
#[allow(clippy::mut_range_bound)]
|
||||
pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
|
||||
let max_length: usize = matches
|
||||
.get_one::<usize>("max-length")
|
||||
.copied()
|
||||
.expect("parameter has default");
|
||||
|
||||
let words: Vec<String> = {
|
||||
let path = matches
|
||||
.get_one::<PathBuf>("words")
|
||||
.expect("missing required parameter");
|
||||
|
||||
let file = fs::read_to_string(&path)
|
||||
.await
|
||||
.wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
|
||||
|
||||
file.lines().map(str::to_string).collect()
|
||||
};
|
||||
|
||||
if words.is_empty() {
|
||||
eyre::bail!("Word list must not be empty");
|
||||
}
|
||||
|
||||
let mut delimiters: Vec<String> = matches
|
||||
.get_many::<String>("delimiter")
|
||||
.unwrap_or_default()
|
||||
.cloned()
|
||||
.collect();
|
||||
|
||||
if delimiters.is_empty() {
|
||||
delimiters.push(String::from("/"));
|
||||
delimiters.push(String::from("_"));
|
||||
}
|
||||
|
||||
let delimiters_len = delimiters.len();
|
||||
|
||||
let word_count = words.len();
|
||||
tracing::info!("{} words to try", word_count);
|
||||
|
||||
// To be able to easily combine the permutations of words and delimiters,
|
||||
// we turn the latter into a pre-defined list of all permutations of delimiters
|
||||
// that are possible at the given amount of words.
|
||||
// Combining `Iterator::cycle` with `Itertools::permutations` works, but
|
||||
// with a high `max_length`, it runs OOM.
|
||||
// So we basically have to implement a smaller version of the iterative algorithm we use later on
|
||||
// to build permutations of the actual words.
|
||||
let delimiter_lists = {
|
||||
let mut indices = vec![0; max_length - 1];
|
||||
let mut list = Vec::new();
|
||||
|
||||
for _ in 0..delimiters_len.pow(max_length as u32 - 1) {
|
||||
list.push(indices.iter().map(|i| &delimiters[*i]).collect::<Vec<_>>());
|
||||
|
||||
for v in indices.iter_mut() {
|
||||
if *v >= delimiters_len - 1 {
|
||||
*v = 0;
|
||||
break;
|
||||
} else {
|
||||
*v += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
list
|
||||
};
|
||||
|
||||
tracing::debug!("{:?}", delimiter_lists);
|
||||
|
||||
let mut count = 0u64;
|
||||
|
||||
let mut indices = if let Some(cont) = matches.get_one::<String>("continue").cloned() {
|
||||
let mut splits = vec![cont.clone()];
|
||||
|
||||
for delim in delimiters.iter() {
|
||||
splits = splits
|
||||
.iter()
|
||||
.flat_map(|s| s.split(delim))
|
||||
.map(|s| s.to_string())
|
||||
.collect();
|
||||
}
|
||||
|
||||
let indices = splits
|
||||
.into_iter()
|
||||
.map(|s| {
|
||||
words
|
||||
.iter()
|
||||
.enumerate()
|
||||
.find(|(_, v)| s == **v)
|
||||
.map(|(i, _)| i)
|
||||
.ok_or_else(|| eyre::eyre!("'{}' is not in the word list", s))
|
||||
})
|
||||
.collect::<Result<_>>()?;
|
||||
|
||||
tracing::info!("Continuing from '{}' -> '{:?}'", cont, &indices);
|
||||
|
||||
indices
|
||||
} else {
|
||||
vec![0]
|
||||
};
|
||||
let mut indices_len = indices.len();
|
||||
let mut sequence = indices
|
||||
.iter()
|
||||
.map(|index| words[*index].as_str())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// Prevent re-allocation by reserving as much as we need upfront
|
||||
indices.reserve(max_length);
|
||||
sequence.reserve(max_length);
|
||||
|
||||
'outer: loop {
|
||||
// We only want delimiters between words, so we keep that iterator shorter by
|
||||
// one.
|
||||
let delimiter_count = sequence.len() as u32 - 1;
|
||||
|
||||
tracing::trace!(
|
||||
"{} | {:?} -> {:?}",
|
||||
delimiters_len.pow(delimiter_count),
|
||||
indices,
|
||||
sequence
|
||||
);
|
||||
|
||||
for delims in delimiter_lists
|
||||
.iter()
|
||||
.take(delimiters_len.pow(delimiter_count))
|
||||
{
|
||||
let delims = delims
|
||||
.iter()
|
||||
.map(|s| s.as_str())
|
||||
.take(delimiter_count as usize);
|
||||
let s: String = sequence
|
||||
.iter()
|
||||
.copied()
|
||||
.interleave(delims)
|
||||
.flat_map(|word| word.chars())
|
||||
.collect();
|
||||
|
||||
count = count.wrapping_add(1);
|
||||
|
||||
if count % 500000 == 0 {
|
||||
tracing::info!("{} words generated", count);
|
||||
}
|
||||
|
||||
println!("{}", s);
|
||||
}
|
||||
|
||||
for i in 0..indices_len {
|
||||
let index = indices.get_mut(i).unwrap();
|
||||
let word = sequence.get_mut(i).unwrap();
|
||||
|
||||
if *index >= word_count - 1 {
|
||||
*index = 0;
|
||||
*word = words[*index].as_str();
|
||||
|
||||
if indices.get(i + 1).is_none() {
|
||||
indices.push(0);
|
||||
sequence.push(words[0].as_str());
|
||||
|
||||
indices_len += 1;
|
||||
|
||||
if indices_len > max_length {
|
||||
break 'outer;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
*index += 1;
|
||||
*word = words[*index].as_str();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
|
@ -1,18 +1,21 @@
|
|||
use clap::{ArgMatches, Command};
|
||||
use color_eyre::Result;
|
||||
|
||||
mod brute_force_words;
|
||||
mod extract_words;
|
||||
|
||||
pub(crate) fn command_definition() -> Command {
|
||||
Command::new("experiment")
|
||||
.subcommand_required(true)
|
||||
.about("A collection of utilities and experiments.")
|
||||
.subcommand(brute_force_words::command_definition())
|
||||
.subcommand(extract_words::command_definition())
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all)]
|
||||
pub(crate) async fn run(ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
|
||||
match matches.subcommand() {
|
||||
Some(("brute-force-words", sub_matches)) => brute_force_words::run(ctx, sub_matches).await,
|
||||
Some(("extract-words", sub_matches)) => extract_words::run(ctx, sub_matches).await,
|
||||
_ => unreachable!(
|
||||
"clap is configured to require a subcommand, and they're all handled above"
|
||||
|
|
Loading…
Add table
Reference in a new issue