From 2daff544a55c3c4f95f50b6cb715fc4bcb73d5c1 Mon Sep 17 00:00:00 2001
From: Lucas Schwiderski
Date: Wed, 17 Jul 2024 09:18:56 +0200
Subject: [PATCH 01/10] Add subcommand for experimental operations

These may be temporary ones that help while analyzing and developing
file formats, or long-term experiments.
---
 crates/dtmt/src/cmd/experiment/mod.rs | 17 +++++++++++++++++
 crates/dtmt/src/main.rs               |  3 +++
 2 files changed, 20 insertions(+)
 create mode 100644 crates/dtmt/src/cmd/experiment/mod.rs

diff --git a/crates/dtmt/src/cmd/experiment/mod.rs b/crates/dtmt/src/cmd/experiment/mod.rs
new file mode 100644
index 0000000..b29f83a
--- /dev/null
+++ b/crates/dtmt/src/cmd/experiment/mod.rs
@@ -0,0 +1,17 @@
+use clap::{ArgMatches, Command};
+use color_eyre::Result;
+
+pub(crate) fn command_definition() -> Command {
+    Command::new("experiment")
+        .subcommand_required(true)
+        .about("A collection of utilities and experiments.")
+}
+
+#[tracing::instrument(skip_all)]
+pub(crate) async fn run(ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
+    match matches.subcommand() {
+        _ => unreachable!(
+            "clap is configured to require a subcommand, and they're all handled above"
+        ),
+    }
+}
diff --git a/crates/dtmt/src/main.rs b/crates/dtmt/src/main.rs
index 2e10b17..b01956a 100644
--- a/crates/dtmt/src/main.rs
+++ b/crates/dtmt/src/main.rs
@@ -21,6 +21,7 @@ mod cmd {
     pub mod build;
     pub mod bundle;
     pub mod dictionary;
+    pub mod experiment;
     pub mod migrate;
     pub mod murmur;
     pub mod new;
@@ -56,6 +57,7 @@ async fn main() -> Result<()> {
         .subcommand(cmd::build::command_definition())
         .subcommand(cmd::bundle::command_definition())
         .subcommand(cmd::dictionary::command_definition())
+        .subcommand(cmd::experiment::command_definition())
         .subcommand(cmd::migrate::command_definition())
         .subcommand(cmd::murmur::command_definition())
         .subcommand(cmd::new::command_definition())
@@ -133,6 +135,7 @@ async fn main() -> Result<()> {
         Some(("build", sub_matches)) => cmd::build::run(ctx, sub_matches).await?,
         Some(("bundle", sub_matches)) => cmd::bundle::run(ctx, sub_matches).await?,
         Some(("dictionary", sub_matches)) => cmd::dictionary::run(ctx, sub_matches).await?,
+        Some(("experiment", sub_matches)) => cmd::experiment::run(ctx, sub_matches).await?,
         Some(("migrate", sub_matches)) => cmd::migrate::run(ctx, sub_matches).await?,
         Some(("murmur", sub_matches)) => cmd::murmur::run(ctx, sub_matches).await?,
         Some(("new", sub_matches)) => cmd::new::run(ctx, sub_matches).await?,

From 94347d57f9790ff61188a64192e98b602d891afb Mon Sep 17 00:00:00 2001
From: Lucas Schwiderski
Date: Sat, 16 Sep 2023 18:43:52 +0200
Subject: [PATCH 02/10] dtmt: Add command to extract words from file

As part of trying to brute force values for the dictionary, this
allows extracting candidate words from a file.
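For reference, a rough standalone sketch of the scanner idea this patch
implements (not the patch's code: a simplified version that hard-codes the
"identifier" rules; the `extract_identifiers` name and collecting results
into a Vec instead of streaming to stdout are illustrative assumptions):

    use std::collections::HashSet;

    /// Simplified scanner: collects unique words that start with an ASCII
    /// letter and continue with ASCII alphanumerics, dropping anything
    /// shorter than `min_length`.
    fn extract_identifiers(content: &str, min_length: usize) -> Vec<String> {
        let mut seen = HashSet::new();
        let mut out = Vec::new();
        let mut word = String::new();

        // A '\0' sentinel guarantees the final word is flushed.
        for c in content.chars().chain(std::iter::once('\0')) {
            let is_word_char = if word.is_empty() {
                c.is_ascii_alphabetic() // start rule
            } else {
                c.is_ascii_alphanumeric() // body rule
            };

            if is_word_char {
                word.push(c);
            } else if !word.is_empty() {
                if word.len() >= min_length && seen.insert(word.clone()) {
                    out.push(word.clone());
                }
                word.clear();
            }
        }

        out
    }

    fn main() {
        for w in extract_identifiers("lights_03 = world/units/lights_03", 3) {
            println!("{w}"); // prints: lights, world, units
        }
    }

The actual command below generalizes the two predicates per algorithm and
adds a length check for the fixed-width hash variants.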
---
 .../dtmt/src/cmd/experiment/extract_words.rs | 182 ++++++++++++++++++
 crates/dtmt/src/cmd/experiment/mod.rs        |   4 +
 2 files changed, 186 insertions(+)
 create mode 100644 crates/dtmt/src/cmd/experiment/extract_words.rs

diff --git a/crates/dtmt/src/cmd/experiment/extract_words.rs b/crates/dtmt/src/cmd/experiment/extract_words.rs
new file mode 100644
index 0000000..512038d
--- /dev/null
+++ b/crates/dtmt/src/cmd/experiment/extract_words.rs
@@ -0,0 +1,182 @@
+use std::collections::HashSet;
+use std::path::PathBuf;
+
+use clap::{value_parser, Arg, ArgMatches, Command, ValueEnum};
+use color_eyre::eyre::Context;
+use color_eyre::Result;
+use tokio::fs;
+
+pub(crate) fn command_definition() -> Command {
+    Command::new("extract-words")
+        .about(
+            "Extract unique alphanumeric sequences that match common identifier rules from the given file. \
+             Only ASCII is supported.",
+        )
+        .arg(
+            Arg::new("file")
+                .required(true)
+                .value_parser(value_parser!(PathBuf))
+                .help("Path to the file to extract words from."),
+        )
+        .arg(
+            Arg::new("min-length")
+                .help("Minimum length to consider a word.")
+                .long("min-length")
+                .short('m')
+                .default_value("3")
+                .value_parser(value_parser!(usize))
+        )
+        .arg(
+            Arg::new("algorithm")
+                .help("The algorithm to determine matching words")
+                .long("algorithm")
+                .short('a')
+                .default_value("identifier")
+                .value_parser(value_parser!(Algorithm))
+        )
+}
+
+#[derive(Copy, Clone, Debug, ValueEnum)]
+#[value(rename_all = "snake_case")]
+enum Algorithm {
+    Alphabetic,
+    Alphanumeric,
+    Identifier,
+    Number,
+    Hash32,
+    Hash64,
+}
+
+impl Algorithm {
+    fn is_start(&self, c: char) -> bool {
+        match self {
+            Self::Alphabetic => c.is_ascii_alphabetic(),
+            Self::Alphanumeric => c.is_ascii_alphanumeric(),
+            Self::Identifier => c.is_ascii_alphabetic(),
+            Self::Number => c.is_numeric(),
+            Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
+        }
+    }
+
+    fn is_body(&self, c: char) -> bool {
+        match self {
+            Self::Alphabetic => c.is_ascii_alphabetic(),
+            Self::Alphanumeric => c.is_ascii_alphanumeric(),
+            Self::Identifier => c.is_ascii_alphanumeric(),
+            Self::Number => c.is_numeric(),
+            Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
+        }
+    }
+
+    fn is_length(&self, len: usize) -> bool {
+        match self {
+            Self::Alphabetic => true,
+            Self::Alphanumeric => true,
+            Self::Identifier => true,
+            Self::Number => true,
+            Self::Hash32 => len == 8,
+            Self::Hash64 => len == 16,
+        }
+    }
+}
+
+impl std::fmt::Display for Algorithm {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "{}",
+            match self {
+                Algorithm::Alphabetic => "alphabetic",
+                Algorithm::Alphanumeric => "alphanumeric",
+                Algorithm::Identifier => "identifier",
+                Algorithm::Number => "number",
+                Algorithm::Hash32 => "hash32",
+                Algorithm::Hash64 => "hash64",
+            }
+        )
+    }
+}
+
+#[derive(Copy, Clone, Debug)]
+enum State {
+    Begin,
+    NonWord,
+    Word,
+    End,
+}
+
+#[tracing::instrument(skip_all)]
+pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
+    let path = matches
+        .get_one::<PathBuf>("file")
+        .expect("missing required parameter");
+
+    let algorithm = matches
+        .get_one::<Algorithm>("algorithm")
+        .expect("parameter has default");
+
+    let min_length = matches
+        .get_one::<usize>("min-length")
+        .copied()
+        .expect("parameter has default");
+
+    let content = fs::read_to_string(&path)
+        .await
+        .wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
+    let mut chars = content.chars();
+
+    let mut state = State::Begin;
+ let mut word = String::new(); + let mut visited = HashSet::new(); + + 'machine: loop { + state = match state { + State::Begin => match chars.next() { + None => State::End, + Some(c) if algorithm.is_start(c) => { + word.push(c); + State::Word + } + Some(_) => State::NonWord, + }, + State::End => break 'machine, + State::NonWord => match chars.next() { + None => State::End, + Some(c) if algorithm.is_body(c) => { + word.push(c); + State::Word + } + Some(_) => State::NonWord, + }, + State::Word => match chars.next() { + None => { + if word.len() >= min_length + && algorithm.is_length(word.len()) + && !visited.contains(&word) + { + println!("{}", &word); + visited.insert(word.clone()); + } + State::End + } + Some(c) if algorithm.is_body(c) => { + word.push(c); + State::Word + } + Some(_) => { + if word.len() >= min_length + && algorithm.is_length(word.len()) + && !visited.contains(&word) + { + println!("{}", &word); + visited.insert(word.clone()); + } + word.clear(); + State::NonWord + } + }, + } + } + + Ok(()) +} diff --git a/crates/dtmt/src/cmd/experiment/mod.rs b/crates/dtmt/src/cmd/experiment/mod.rs index b29f83a..51e5fc7 100644 --- a/crates/dtmt/src/cmd/experiment/mod.rs +++ b/crates/dtmt/src/cmd/experiment/mod.rs @@ -1,15 +1,19 @@ use clap::{ArgMatches, Command}; use color_eyre::Result; +mod extract_words; + pub(crate) fn command_definition() -> Command { Command::new("experiment") .subcommand_required(true) .about("A collection of utilities and experiments.") + .subcommand(extract_words::command_definition()) } #[tracing::instrument(skip_all)] pub(crate) async fn run(ctx: sdk::Context, matches: &ArgMatches) -> Result<()> { match matches.subcommand() { + Some(("extract-words", sub_matches)) => extract_words::run(ctx, sub_matches).await, _ => unreachable!( "clap is configured to require a subcommand, and they're all handled above" ), From 6485dae27bc152a6eef7533d548e2c8f871b6563 Mon Sep 17 00:00:00 2001 From: Lucas Schwiderski Date: Sat, 16 Sep 2023 19:03:04 +0200 Subject: [PATCH 03/10] experiment: Add command to create word permutations This creates candidate values to brute force dictionary entries with, by building combinations from a word list and delimiters. 
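To make the enumeration order concrete, here is a minimal sketch of the
growing-length counter the command below is built around (an illustration
of the approach only, not the patch's code: it fixes a single '/' delimiter
instead of permuting the delimiter set, so the exact output order differs
from the example in the help text, and `next_sequence` is a hypothetical
helper name):

    /// Advance `indices` like an odometer where every position counts
    /// 0..word_count. Returns false once the sequence would grow past
    /// `max_len` words.
    fn next_sequence(indices: &mut Vec<usize>, word_count: usize, max_len: usize) -> bool {
        for i in 0..indices.len() {
            if indices[i] + 1 < word_count {
                indices[i] += 1;
                return true;
            }
            indices[i] = 0; // overflow: reset this position and carry on
        }
        // The carry ran past the last position: grow by one word.
        indices.push(0);
        indices.len() <= max_len
    }

    fn main() {
        let words = ["packages", "boot"];
        let mut indices = vec![0];
        loop {
            let parts: Vec<&str> = indices.iter().map(|&i| words[i]).collect();
            println!("{}", parts.join("/"));
            if !next_sequence(&mut indices, words.len(), 2) {
                break;
            }
        }
    }

Enumerating index sequences rather than strings keeps the state tiny and
makes it trivial to resume from a known position (the `--continue` flag).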
---
 Cargo.lock                                    |  35 ++-
 crates/dtmt/Cargo.toml                        |   2 +
 .../src/cmd/experiment/brute_force_words.rs   | 239 ++++++++++++++++++
 crates/dtmt/src/cmd/experiment/mod.rs         |   3 +
 4 files changed, 277 insertions(+), 2 deletions(-)
 create mode 100644 crates/dtmt/src/cmd/experiment/brute_force_words.rs

diff --git a/Cargo.lock b/Cargo.lock
index a251de9..dac07e9 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -161,6 +161,17 @@ dependencies = [
  "system-deps",
 ]
 
+[[package]]
+name = "atty"
+version = "0.2.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
+dependencies = [
+ "hermit-abi 0.1.19",
+ "libc",
+ "winapi",
+]
+
 [[package]]
 name = "autocfg"
 version = "1.3.0"
@@ -212,7 +223,7 @@ dependencies = [
  "bitflags 2.5.0",
  "cexpr",
  "clang-sys",
- "itertools",
+ "itertools 0.12.1",
  "lazy_static",
  "lazycell",
  "log",
@@ -927,6 +938,7 @@ name = "dtmt"
 version = "0.3.0"
 dependencies = [
  "async-recursion",
+ "atty",
  "clap",
  "cli-table",
  "color-eyre",
@@ -936,6 +948,7 @@ dependencies = [
  "futures",
  "futures-util",
  "glob",
+ "itertools 0.11.0",
  "luajit2-sys",
  "nanorand",
  "notify",
@@ -1598,6 +1611,15 @@ version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
 
+[[package]]
+name = "hermit-abi"
+version = "0.1.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "hermit-abi"
 version = "0.3.9"
@@ -1858,6 +1880,15 @@ version = "1.70.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800"
 
+[[package]]
+name = "itertools"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57"
+dependencies = [
+ "either",
+]
+
 [[package]]
 name = "itertools"
 version = "0.12.1"
@@ -2267,7 +2298,7 @@ version = "1.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
 dependencies = [
- "hermit-abi",
+ "hermit-abi 0.3.9",
  "libc",
 ]
 
diff --git a/crates/dtmt/Cargo.toml b/crates/dtmt/Cargo.toml
index d836a50..8018a8b 100644
--- a/crates/dtmt/Cargo.toml
+++ b/crates/dtmt/Cargo.toml
@@ -33,6 +33,8 @@ async-recursion = "1.0.2"
 notify = "6.1.1"
 luajit2-sys = { path = "../../lib/luajit2-sys", version = "*" }
 shlex = { version = "1.2.0", optional = true }
+atty = "0.2.14"
+itertools = "0.11.0"
 
 [dev-dependencies]
 tempfile = "3.3.0"
diff --git a/crates/dtmt/src/cmd/experiment/brute_force_words.rs b/crates/dtmt/src/cmd/experiment/brute_force_words.rs
new file mode 100644
index 0000000..6bf81bb
--- /dev/null
+++ b/crates/dtmt/src/cmd/experiment/brute_force_words.rs
@@ -0,0 +1,239 @@
+use std::path::PathBuf;
+
+use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
+use color_eyre::eyre::{self, Context};
+use color_eyre::Result;
+use itertools::Itertools;
+use tokio::fs;
+
+pub(crate) fn command_definition() -> Command {
+    Command::new("brute-force-words")
+        .about(
+            "Given a list of words and a set of delimiters, iteratively creates permutations \
+             of growing length.\n\
+             Delimiters are placed between every word in the result.\n\n\
+             Example: \
+             Given the words ['packages', 'boot'], the delimiters ['/', '_'] and a \
+             length of 2, the resulting \
+             words will be\n\
+             - packages\n\
+             - boot\n\
+             - packages/packages\n\
+             - packages_packages\n\
+             - packages/boot\n\
+             - packages_boot\n\
+             - boot/packages\n\
+             - boot_packages\n\
+             - boot/boot\n\
+             - boot_boot",
+        )
+        .arg(
+            Arg::new("delimiter")
+                .help(
+                    "The delimiters to put between the words. \
+                     All permutations of this list will be tried for every string of words.\n\
+                     Specify multiple times to set multiple values.\n\
+                     Defaults to ['/', '_'].",
+                )
+                .short('d')
+                .long("delimiter")
+                .action(ArgAction::Append),
+        )
+        .arg(
+            Arg::new("max-length")
+                .help("The maximum number of words up to which to build strings.")
+                .long("max")
+                .long("max-length")
+                .short('m')
+                .default_value("5")
+                .value_parser(value_parser!(usize)),
+        )
+        .arg(
+            Arg::new("continue")
+                .help("Can be used to continue a previous operation where it stopped. Word list and delimiters must match.")
+                .short('c')
+                .long("continue")
+        )
+        .arg(
+            Arg::new("words")
+                .help("Path to a file containing words line by line.")
+                .required(true)
+                .value_parser(value_parser!(PathBuf)),
+        )
+}
+
+#[tracing::instrument(skip_all)]
+#[allow(clippy::mut_range_bound)]
+pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
+    let max_length: usize = matches
+        .get_one::<usize>("max-length")
+        .copied()
+        .expect("parameter has default");
+
+    let words: Vec<String> = {
+        let path = matches
+            .get_one::<PathBuf>("words")
+            .expect("missing required parameter");
+
+        let file = fs::read_to_string(&path)
+            .await
+            .wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
+
+        file.lines().map(str::to_string).collect()
+    };
+
+    if words.is_empty() {
+        eyre::bail!("Word list must not be empty");
+    }
+
+    let mut delimiters: Vec<String> = matches
+        .get_many::<String>("delimiter")
+        .unwrap_or_default()
+        .cloned()
+        .collect();
+
+    if delimiters.is_empty() {
+        delimiters.push(String::from("/"));
+        delimiters.push(String::from("_"));
+    }
+
+    let delimiters_len = delimiters.len();
+
+    let word_count = words.len();
+    tracing::info!("{} words to try", word_count);
+
+    // To be able to easily combine the permutations of words and delimiters,
+    // we turn the latter into a pre-defined list of all permutations of delimiters
+    // that are possible for the given number of words.
+    // Combining `Iterator::cycle` with `Itertools::permutations` works, but
+    // with a high `max_length`, it runs OOM.
+    // So we basically have to implement a smaller version of the iterative algorithm we use later on
+    // to build permutations of the actual words.
+    let delimiter_lists = {
+        let mut indices = vec![0; max_length - 1];
+        let mut list = Vec::new();
+
+        for _ in 0..delimiters_len.pow(max_length as u32 - 1) {
+            list.push(indices.iter().map(|i| &delimiters[*i]).collect::<Vec<_>>());
+
+            for v in indices.iter_mut() {
+                if *v >= delimiters_len - 1 {
+                    *v = 0;
+                    break;
+                } else {
+                    *v += 1;
+                }
+            }
+        }
+
+        list
+    };
+
+    tracing::debug!("{:?}", delimiter_lists);
+
+    let mut count = 0u64;
+
+    let mut indices = if let Some(cont) = matches.get_one::<String>("continue").cloned() {
+        let mut splits = vec![cont.clone()];
+
+        for delim in delimiters.iter() {
+            splits = splits
+                .iter()
+                .flat_map(|s| s.split(delim))
+                .map(|s| s.to_string())
+                .collect();
+        }
+
+        let indices = splits
+            .into_iter()
+            .map(|s| {
+                words
+                    .iter()
+                    .enumerate()
+                    .find(|(_, v)| s == **v)
+                    .map(|(i, _)| i)
+                    .ok_or_else(|| eyre::eyre!("'{}' is not in the word list", s))
+            })
+            .collect::<Result<Vec<_>, _>>()?;
+
+        tracing::info!("Continuing from '{}' -> '{:?}'", cont, &indices);
+
+        indices
+    } else {
+        vec![0]
+    };
+    let mut indices_len = indices.len();
+    let mut sequence = indices
+        .iter()
+        .map(|index| words[*index].as_str())
+        .collect::<Vec<_>>();
+
+    // Prevent re-allocation by reserving as much as we need upfront
+    indices.reserve(max_length);
+    sequence.reserve(max_length);
+
+    'outer: loop {
+        // We only want delimiters between words, so we keep that iterator shorter by
+        // one.
+        let delimiter_count = sequence.len() as u32 - 1;
+
+        tracing::trace!(
+            "{} | {:?} -> {:?}",
+            delimiters_len.pow(delimiter_count),
+            indices,
+            sequence
+        );
+
+        for delims in delimiter_lists
+            .iter()
+            .take(delimiters_len.pow(delimiter_count))
+        {
+            let delims = delims
+                .iter()
+                .map(|s| s.as_str())
+                .take(delimiter_count as usize);
+            let s: String = sequence
+                .iter()
+                .copied()
+                .interleave(delims)
+                .flat_map(|word| word.chars())
+                .collect();
+
+            count = count.wrapping_add(1);
+
+            if count % 500000 == 0 {
+                tracing::info!("{} words generated", count);
+            }
+
+            println!("{}", s);
+        }
+
+        for i in 0..indices_len {
+            let index = indices.get_mut(i).unwrap();
+            let word = sequence.get_mut(i).unwrap();
+
+            if *index >= word_count - 1 {
+                *index = 0;
+                *word = words[*index].as_str();
+
+                if indices.get(i + 1).is_none() {
+                    indices.push(0);
+                    sequence.push(words[0].as_str());
+
+                    indices_len += 1;
+
+                    if indices_len > max_length {
+                        break 'outer;
+                    }
+
+                    break;
+                }
+            } else {
+                *index += 1;
+                *word = words[*index].as_str();
+                break;
+            }
+        }
+    }
+
+    Ok(())
+}
diff --git a/crates/dtmt/src/cmd/experiment/mod.rs b/crates/dtmt/src/cmd/experiment/mod.rs
index 51e5fc7..9ceb3b9 100644
--- a/crates/dtmt/src/cmd/experiment/mod.rs
+++ b/crates/dtmt/src/cmd/experiment/mod.rs
@@ -1,18 +1,21 @@
 use clap::{ArgMatches, Command};
 use color_eyre::Result;
 
+mod brute_force_words;
 mod extract_words;
 
 pub(crate) fn command_definition() -> Command {
     Command::new("experiment")
         .subcommand_required(true)
         .about("A collection of utilities and experiments.")
+        .subcommand(brute_force_words::command_definition())
         .subcommand(extract_words::command_definition())
 }
 
 #[tracing::instrument(skip_all)]
 pub(crate) async fn run(ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
     match matches.subcommand() {
+        Some(("brute-force-words", sub_matches)) => brute_force_words::run(ctx, sub_matches).await,
         Some(("extract-words", sub_matches)) => extract_words::run(ctx, sub_matches).await,
         _ => unreachable!(
             "clap is configured to require a subcommand, and they're all handled above"

From 0d1193a12688567fc30b963b3d79c20d96bf55c4 Mon Sep 17 00:00:00 2001
From: Lucas Schwiderski Date: Mon, 18 Sep 2023 10:26:58 +0200 Subject: [PATCH 04/10] sdk: Improve word generation throughput It seems that the simple `println!()` is really bad when the goal is to write a lot of data to stdout. Presumably because it's unbuffered, but also because it required the preceding code to do a lot of allocations. This was replaced with a buffered writer on stdout, as well as an extra `Vec` that I can write everything to directly from the word and delimiter iterators, without allocating a single new structure. --- .../src/cmd/experiment/brute_force_words.rs | 76 ++++++++++++++++--- 1 file changed, 67 insertions(+), 9 deletions(-) diff --git a/crates/dtmt/src/cmd/experiment/brute_force_words.rs b/crates/dtmt/src/cmd/experiment/brute_force_words.rs index 6bf81bb..d2891f9 100644 --- a/crates/dtmt/src/cmd/experiment/brute_force_words.rs +++ b/crates/dtmt/src/cmd/experiment/brute_force_words.rs @@ -5,6 +5,7 @@ use color_eyre::eyre::{self, Context}; use color_eyre::Result; use itertools::Itertools; use tokio::fs; +use tokio::io::{AsyncWriteExt, BufWriter}; pub(crate) fn command_definition() -> Command { Command::new("brute-force-words") @@ -98,6 +99,38 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> let delimiters_len = delimiters.len(); + let prefixes = [ + "", + "content/characters/", + "content/debug/", + "content/decals/", + "content/environment/", + "content/fx/", + "content/gizmos/", + "content/items/", + "content/levels/", + "content/liquid_area/", + "content/localization/", + "content/materials/", + "content/minion_impact_assets/", + "content/pickups/", + "content/shading_environments/", + "content/textures/", + "content/ui/", + "content/videos/", + "content/vo/", + "content/volume_types/", + "content/weapons/", + "packages/boot_assets/", + "packages/content/", + "packages/game_scripts/", + "packages/strings/", + "packages/ui/", + "wwise/events/", + "wwise/packages/", + "wwise/world_sound_fx/", + ]; + let word_count = words.len(); tracing::info!("{} words to try", word_count); @@ -171,6 +204,13 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> indices.reserve(max_length); sequence.reserve(max_length); + let mut writer = BufWriter::new(tokio::io::stdout()); + let mut buf = Vec::with_capacity(1024); + + const LINE_FEED: u8 = 0x0A; + const UNDERSCORE: u8 = 0x5F; + const ZERO: u8 = 0x30; + 'outer: loop { // We only want delimiters between words, so we keep that iterator shorter by // one. 
@@ -191,20 +231,38 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
             .iter()
             .map(|s| s.as_str())
             .take(delimiter_count as usize);
-        let s: String = sequence
-            .iter()
-            .copied()
-            .interleave(delims)
-            .flat_map(|word| word.chars())
-            .collect();
+        let s = sequence.iter().copied().interleave(delims.clone());
 
         count = count.wrapping_add(1);
 
-        if count % 500000 == 0 {
-            tracing::info!("{} words generated", count);
+        buf.clear();
+
+        for prefix in prefixes.iter() {
+            buf.extend_from_slice(prefix.as_bytes());
+            s.clone()
+                .for_each(|word| buf.extend_from_slice(word.as_bytes()));
+            // buf.extend_from_slice(s.as_bytes());
+            buf.push(LINE_FEED);
+
+            for i in 0..=9 {
+                buf.extend_from_slice(prefix.as_bytes());
+                s.clone()
+                    .for_each(|word| buf.extend_from_slice(word.as_bytes()));
+                buf.push(UNDERSCORE);
+                buf.push(ZERO + i);
+                buf.push(LINE_FEED);
+
+                buf.extend_from_slice(prefix.as_bytes());
+                s.clone()
+                    .for_each(|word| buf.extend_from_slice(word.as_bytes()));
+                buf.push(UNDERSCORE);
+                buf.push(ZERO);
+                buf.push(ZERO + i);
+                buf.push(LINE_FEED);
+            }
         }
 
-        println!("{}", s);
+        writer.write_all(&buf).await?;
     }
 
     for i in 0..indices_len {

From 4480144d92db48f1f1515ef046984f941f14a89f Mon Sep 17 00:00:00 2001
From: Lucas Schwiderski
Date: Mon, 18 Sep 2023 13:29:42 +0200
Subject: [PATCH 05/10] sdk: Implement guessing a list of hashes

While the approach to generate and store a list of strings does allow
for this list to be re-used in the future, the I/O involved turned out
to be quite costly. While the generation can run at up to 500 MiB/s,
even compressing that on the fly doesn't reach fast enough write speeds
on a HDD. And compression is also necessary to store this amount of
data (generation reached two TB of raw data with a word length of just
three, which is still 600 GB compressed). But compression also makes
working with that data a lot harder.

So this instead combines both the generation and search into a single
step. The intermediate result of the generation is therefore lost, but
the overall pipeline is much faster.
---
 .../src/cmd/experiment/brute_force_words.rs   | 120 +++++++++++++++---
 1 file changed, 102 insertions(+), 18 deletions(-)

diff --git a/crates/dtmt/src/cmd/experiment/brute_force_words.rs b/crates/dtmt/src/cmd/experiment/brute_force_words.rs
index d2891f9..bb3aa9e 100644
--- a/crates/dtmt/src/cmd/experiment/brute_force_words.rs
+++ b/crates/dtmt/src/cmd/experiment/brute_force_words.rs
@@ -1,11 +1,14 @@
+use std::collections::HashSet;
 use std::path::PathBuf;
 
 use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
 use color_eyre::eyre::{self, Context};
 use color_eyre::Result;
 use itertools::Itertools;
+use sdk::murmur::Murmur64;
 use tokio::fs;
-use tokio::io::{AsyncWriteExt, BufWriter};
+use tokio::io::AsyncWriteExt;
+use tokio::time::Instant;
 
 pub(crate) fn command_definition() -> Command {
     Command::new("brute-force-words")
@@ -60,6 +63,15 @@ pub(crate) fn command_definition() -> Command {
             .required(true)
             .value_parser(value_parser!(PathBuf)),
     )
+    .arg(
+        Arg::new("hashes")
+            .help(
+                "Path to a file containing the hashes to attempt to brute force. \
+                 Hashes are expected in hexadecimal notation. \
+                 Only 64-bit hashes are supported."
+            )
+            .value_parser(value_parser!(PathBuf)),
+    )
 }
 
 #[tracing::instrument(skip_all)]
@@ -86,6 +98,25 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
         eyre::bail!("Word list must not be empty");
     }
 
+    let hashes = if let Some(path) = matches.get_one::<PathBuf>("hashes") {
+        let content = fs::read_to_string(&path)
+            .await
+            .wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
+
+        let hashes: Result<HashSet<Murmur64>, _> = content
+            .lines()
+            .map(|s| u64::from_str_radix(s, 16).map(Murmur64::from))
+            .collect();
+
+        let hashes = hashes?;
+
+        tracing::trace!("{:?}", hashes);
+
+        Some(hashes)
+    } else {
+        None
+    };
+
     let mut delimiters: Vec<String> = matches
         .get_many::<String>("delimiter")
         .unwrap_or_default()
@@ -163,8 +194,6 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
 
     tracing::debug!("{:?}", delimiter_lists);
 
-    let mut count = 0u64;
-
     let mut indices = if let Some(cont) = matches.get_one::<String>("continue").cloned() {
         let mut splits = vec![cont.clone()];
 
@@ -204,7 +233,12 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
     indices.reserve(max_length);
     sequence.reserve(max_length);
 
-    let mut writer = BufWriter::new(tokio::io::stdout());
+    let mut count: usize = 0;
+    let mut found: usize = 0;
+    let mut start = Instant::now();
+
+    // let mut writer = BufWriter::new(tokio::io::stdout());
+    let mut writer = tokio::io::stdout();
     let mut buf = Vec::with_capacity(1024);
 
     const LINE_FEED: u8 = 0x0A;
@@ -216,13 +250,6 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
         // one.
         let delimiter_count = sequence.len() as u32 - 1;
 
-        tracing::trace!(
-            "{} | {:?} -> {:?}",
-            delimiters_len.pow(delimiter_count),
-            indices,
-            sequence
-        );
-
         for delims in delimiter_lists
             .iter()
             .take(delimiters_len.pow(delimiter_count))
         {
@@ -233,16 +260,25 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
                 .take(delimiter_count as usize);
             let s = sequence.iter().copied().interleave(delims.clone());
 
-            count = count.wrapping_add(1);
-
             buf.clear();
 
             for prefix in prefixes.iter() {
                 buf.extend_from_slice(prefix.as_bytes());
                 s.clone()
                     .for_each(|word| buf.extend_from_slice(word.as_bytes()));
-                // buf.extend_from_slice(s.as_bytes());
-                buf.push(LINE_FEED);
+
+                if let Some(hashes) = &hashes {
+                    let hash = Murmur64::hash(&buf);
+                    if hashes.contains(&hash) {
+                        found += 1;
+                        buf.push(LINE_FEED);
+                        writer.write_all(&buf).await?;
+                    }
+
+                    buf.clear();
+                } else {
+                    buf.push(LINE_FEED);
+                }
 
                 for i in 0..=9 {
                     buf.extend_from_slice(prefix.as_bytes());
@@ -250,7 +286,19 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
                         .for_each(|word| buf.extend_from_slice(word.as_bytes()));
                     buf.push(UNDERSCORE);
                     buf.push(ZERO + i);
-                    buf.push(LINE_FEED);
+
+                    if let Some(hashes) = &hashes {
+                        let hash = Murmur64::hash(&buf);
+                        if hashes.contains(&hash) {
+                            found += 1;
+                            buf.push(LINE_FEED);
+                            writer.write_all(&buf).await?;
+                        }
+
+                        buf.clear();
+                    } else {
+                        buf.push(LINE_FEED);
+                    }
 
                     buf.extend_from_slice(prefix.as_bytes());
                     s.clone()
@@ -258,11 +306,47 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
                     buf.push(UNDERSCORE);
                     buf.push(ZERO);
                    buf.push(ZERO + i);
-                    buf.push(LINE_FEED);
+
+                    if let Some(hashes) = &hashes {
+                        let hash = Murmur64::hash(&buf);
+                        if hashes.contains(&hash) {
+                            found += 1;
+                            buf.push(LINE_FEED);
+                            writer.write_all(&buf).await?;
+                        }
+
+                        buf.clear();
+                    } else {
+                        buf.push(LINE_FEED);
+                    }
                 }
             }
 
-            writer.write_all(&buf).await?;
+            if let Some(hashes) = &hashes {
+                count += prefixes.len() * 20;
+
+                let dur = Instant::now() - start;
+                if dur.as_secs() >= 1 {
+                    let hashes_len = hashes.len();
+                    // Don't care when it finishes, don't care if it fails.
+                    tokio::spawn(async move {
+                        let _ = tokio::io::stderr()
+                            .write_all(
+                                format!(
+                                    "\r{} hashes per second, {}/{} found",
+                                    count, found, hashes_len
+                                )
+                                .as_bytes(),
+                            )
+                            .await;
+                    });
+
+                    start = Instant::now();
+                    count = 0;
+                }
+            } else {
+                writer.write_all(&buf).await?;
+            }
         }
 
         for i in 0..indices_len {

From 951a7f82c0a2f1532df9a14ff622504a03bc20a7 Mon Sep 17 00:00:00 2001
From: Lucas Schwiderski
Date: Tue, 19 Sep 2023 15:28:40 +0200
Subject: [PATCH 06/10] sdk: Improve word generation

---
 .../src/cmd/experiment/brute_force_words.rs   | 164 +++++++++---------
 1 file changed, 79 insertions(+), 85 deletions(-)

diff --git a/crates/dtmt/src/cmd/experiment/brute_force_words.rs b/crates/dtmt/src/cmd/experiment/brute_force_words.rs
index bb3aa9e..7e93dcc 100644
--- a/crates/dtmt/src/cmd/experiment/brute_force_words.rs
+++ b/crates/dtmt/src/cmd/experiment/brute_force_words.rs
@@ -70,6 +70,7 @@ pub(crate) fn command_definition() -> Command {
                 Hashes are expected in hexadecimal notation. \
                 Only 64-bit hashes are supported."
             )
+            .required(true)
             .value_parser(value_parser!(PathBuf)),
     )
 }
@@ -98,7 +99,10 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
         eyre::bail!("Word list must not be empty");
     }
 
-    let hashes = if let Some(path) = matches.get_one::<PathBuf>("hashes") {
+    let hashes = {
+        let path = matches
+            .get_one::<PathBuf>("hashes")
+            .expect("missing required argument");
         let content = fs::read_to_string(&path)
             .await
             .wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
@@ -112,9 +116,7 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
 
         tracing::trace!("{:?}", hashes);
 
-        Some(hashes)
-    } else {
-        None
+        hashes
     };
 
     let mut delimiters: Vec<String> = matches
@@ -250,103 +252,95 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
         // one.
         let delimiter_count = sequence.len() as u32 - 1;
 
-        for delims in delimiter_lists
-            .iter()
-            .take(delimiters_len.pow(delimiter_count))
-        {
-            let delims = delims
-                .iter()
-                .map(|s| s.as_str())
-                .take(delimiter_count as usize);
-            let s = sequence.iter().copied().interleave(delims.clone());
-
+        for prefix in prefixes.iter().map(|p| p.as_bytes()) {
             buf.clear();
 
-            for prefix in prefixes.iter() {
-                buf.extend_from_slice(prefix.as_bytes());
-                s.clone()
+            // We can keep the prefix at the front of the buffer and only
+            // replace the parts after that.
+ let prefix_len = prefix.len(); + buf.extend_from_slice(prefix); + + for delims in delimiter_lists + .iter() + .take(delimiters_len.pow(delimiter_count)) + { + buf.truncate(prefix_len); + + let delims = delims + .iter() + .map(|s| s.as_str()) + .take(delimiter_count as usize); + sequence + .iter() + .copied() + .interleave(delims.clone()) .for_each(|word| buf.extend_from_slice(word.as_bytes())); - if let Some(hashes) = &hashes { - let hash = Murmur64::hash(&buf); - if hashes.contains(&hash) { - found += 1; - buf.push(LINE_FEED); - writer.write_all(&buf).await?; - } + count += 1; - buf.clear(); - } else { + let hash = Murmur64::hash(&buf); + if hashes.contains(&hash) { + found += 1; buf.push(LINE_FEED); - } + writer.write_all(&buf).await?; + } else { + let word_len = buf.len(); - for i in 0..=9 { - buf.extend_from_slice(prefix.as_bytes()); - s.clone() - .for_each(|word| buf.extend_from_slice(word.as_bytes())); - buf.push(UNDERSCORE); - buf.push(ZERO + i); + // If the regular word itself didn't match, we check + // for numbered suffixes. + // For now, we only check up to `09` to avoid more complex logic + // writing into the buffer. + // Packages that contain files with higher numbers than this + // should hopefully become easier to spot once a good number of + // hashes is found. + for i in 1..=9 { + buf.truncate(word_len); + buf.push(UNDERSCORE); + buf.push(ZERO); + buf.push(ZERO + i); + + count += 1; - if let Some(hashes) = &hashes { let hash = Murmur64::hash(&buf); if hashes.contains(&hash) { found += 1; buf.push(LINE_FEED); writer.write_all(&buf).await?; + } else { + break; } - - buf.clear(); - } else { - buf.push(LINE_FEED); - } - - buf.extend_from_slice(prefix.as_bytes()); - s.clone() - .for_each(|word| buf.extend_from_slice(word.as_bytes())); - buf.push(UNDERSCORE); - buf.push(ZERO); - buf.push(ZERO + i); - - if let Some(hashes) = &hashes { - let hash = Murmur64::hash(&buf); - if hashes.contains(&hash) { - found += 1; - buf.push(LINE_FEED); - writer.write_all(&buf).await?; - } - - buf.clear(); - } else { - buf.push(LINE_FEED); } } } + } - if let Some(hashes) = &hashes { - count += prefixes.len() * 20; + let dur = Instant::now() - start; + if dur.as_secs() >= 1 { + let hashes_len = hashes.len(); + let s = String::from_utf8_lossy(&buf); + // The last prefix in the set is the one that will stay in the buffer + // when we're about to print here. + // So we strip that, to show just the generated part. + // We also restrict the length to stay on a single line. + let prefix_len = prefixes[28].len(); + let s = s[prefix_len..std::cmp::min(s.len(), prefix_len + 60)] + .trim_end() + .to_string(); + // Don't care when it finishes, don't care if it fails. + tokio::spawn(async move { + let _ = tokio::io::stderr() + .write_all( + format!( + "\r{:8} hashes per second | {:6}/{} found | {:<60}", + count, found, hashes_len, s + ) + .as_bytes(), + ) + .await; + }); - let dur = Instant::now() - start; - if dur.as_secs() >= 1 { - let hashes_len = hashes.len(); - // Don't care when it finishes, don't care if it fails. 
- tokio::spawn(async move { - let _ = tokio::io::stderr() - .write_all( - format!( - "\r{} hashes per second, {}/{} found", - count, found, hashes_len - ) - .as_bytes(), - ) - .await; - }); - - start = Instant::now(); - count = 0; - } - } else { - writer.write_all(&buf).await?; - } + start = Instant::now(); + count = 0; } for i in 0..indices_len { @@ -358,15 +352,15 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> *word = words[*index].as_str(); if indices.get(i + 1).is_none() { - indices.push(0); - sequence.push(words[0].as_str()); - indices_len += 1; if indices_len > max_length { break 'outer; } + indices.push(0); + sequence.push(words[0].as_str()); + break; } } else { From b366185a63f8182b1c9d8e32b16f738a42f7a912 Mon Sep 17 00:00:00 2001 From: Lucas Schwiderski Date: Tue, 19 Sep 2023 15:29:40 +0200 Subject: [PATCH 07/10] sdk: Implement worker pool for word generation Massive speed improvement. The index generation is really fast, and it appears that even worker numbers way higher than the core/thread count still increase the throughput slightly. The only missing part is the info output. That's broken, currently. --- Cargo.lock | 49 ++ crates/dtmt/Cargo.toml | 1 + .../src/cmd/experiment/brute_force_words.rs | 544 +++++++++++------- crates/dtmt/src/cmd/experiment/mod.rs | 4 +- 4 files changed, 401 insertions(+), 197 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dac07e9..3a02b55 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -658,6 +658,20 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2801af0d36612ae591caa9568261fddce32ce6e08a7275ea334a06a4ad021a2c" +dependencies = [ + "cfg-if", + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-epoch", + "crossbeam-queue", + "crossbeam-utils", +] + [[package]] name = "crossbeam-channel" version = "0.5.12" @@ -667,6 +681,40 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "crossbeam-deque" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +dependencies = [ + "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" +dependencies = [ + "autocfg", + "cfg-if", + "crossbeam-utils", + "memoffset 0.9.1", + "scopeguard", +] + +[[package]] +name = "crossbeam-queue" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1cfb3ea8a53f37c40dea2c7bedcbd88bdfae54f5e2175d6ecaff1c988353add" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.20" @@ -943,6 +991,7 @@ dependencies = [ "cli-table", "color-eyre", "confy", + "crossbeam", "csv-async", "dtmt-shared", "futures", diff --git a/crates/dtmt/Cargo.toml b/crates/dtmt/Cargo.toml index 8018a8b..e80feaa 100644 --- a/crates/dtmt/Cargo.toml +++ b/crates/dtmt/Cargo.toml @@ -35,6 +35,7 @@ luajit2-sys = { path = "../../lib/luajit2-sys", version = "*" } shlex = { version = "1.2.0", optional = true } atty = "0.2.14" itertools = "0.11.0" +crossbeam = { version = "0.8.2", features = ["crossbeam-deque"] } [dev-dependencies] tempfile = "3.3.0" diff --git a/crates/dtmt/src/cmd/experiment/brute_force_words.rs 
b/crates/dtmt/src/cmd/experiment/brute_force_words.rs index 7e93dcc..aa15003 100644 --- a/crates/dtmt/src/cmd/experiment/brute_force_words.rs +++ b/crates/dtmt/src/cmd/experiment/brute_force_words.rs @@ -1,13 +1,16 @@ use std::collections::HashSet; +use std::fs; +use std::io::Write; use std::path::PathBuf; +use std::sync::Arc; +use std::thread::JoinHandle; use clap::{value_parser, Arg, ArgAction, ArgMatches, Command}; use color_eyre::eyre::{self, Context}; use color_eyre::Result; +use crossbeam::channel::{bounded, unbounded, Receiver, Sender}; use itertools::Itertools; use sdk::murmur::Murmur64; -use tokio::fs; -use tokio::io::AsyncWriteExt; use tokio::time::Instant; pub(crate) fn command_definition() -> Command { @@ -57,6 +60,14 @@ pub(crate) fn command_definition() -> Command { .short('c') .long("continue") ) + .arg( + Arg::new("threads") + .help("The number of workers to run in parallel.") + .long("threads") + .short('n') + .default_value("6") + .value_parser(value_parser!(usize)) + ) .arg( Arg::new("words") .help("Path to a file containing words line by line.") @@ -75,36 +86,307 @@ pub(crate) fn command_definition() -> Command { ) } +const LINE_FEED: u8 = 0x0A; +const UNDERSCORE: u8 = 0x5F; +const ZERO: u8 = 0x30; + +const PREFIXES: [&str; 29] = [ + "", + "content/characters/", + "content/debug/", + "content/decals/", + "content/environment/", + "content/fx/", + "content/gizmos/", + "content/items/", + "content/levels/", + "content/liquid_area/", + "content/localization/", + "content/materials/", + "content/minion_impact_assets/", + "content/pickups/", + "content/shading_environments/", + "content/textures/", + "content/ui/", + "content/videos/", + "content/vo/", + "content/volume_types/", + "content/weapons/", + "packages/boot_assets/", + "packages/content/", + "packages/game_scripts/", + "packages/strings/", + "packages/ui/", + "wwise/events/", + "wwise/packages/", + "wwise/world_sound_fx/", +]; + +fn make_info_printer(rx: Receiver<(usize, usize)>, hash_count: usize) -> JoinHandle<()> { + std::thread::spawn(move || { + let mut writer = std::io::stderr(); + let mut total_count = 0; + let mut total_found = 0; + + let start = Instant::now(); + + while let Ok((count, found)) = rx.recv() { + total_count += count; + total_found += found; + + let dur = Instant::now() - start; + if dur.as_secs() > 1 { + let s = format!("\r{total_count} per second | {total_found:6}/{hash_count} found",); + + // let s = String::from_utf8_lossy(&buf); + // // The last prefix in the set is the one that will stay in the buffer + // // when we're about to print here. + // // So we strip that, to show just the generated part. + // // We also restrict the length to stay on a single line. 
+            //     let prefix_len = prefixes[28].len();
+            //     let s = s[prefix_len..std::cmp::min(s.len(), prefix_len + 60)]
+            //         .trim_end()
+            //         .to_string();
+
+                writer.write_all(s.as_bytes()).unwrap();
+
+                total_count = 0;
+            }
+        }
+    })
+}
+
+fn make_stdout_printer(rx: Receiver<Vec<u8>>) -> JoinHandle<()> {
+    std::thread::spawn(move || {
+        let mut writer = std::io::stdout();
+
+        while let Ok(buf) = rx.recv() {
+            writer.write_all(&buf).unwrap();
+        }
+    })
+}
+
+struct State {
+    delimiter_lists: Arc<Vec<Vec<String>>>,
+    hashes: Arc<HashSet<Murmur64>>,
+    words: Arc<Vec<String>>,
+    delimiters_len: usize,
+    stdout_tx: Sender<Vec<u8>>,
+    info_tx: Sender<(usize, usize)>,
+}
+
+fn make_worker(rx: Receiver<Vec<usize>>, state: State) -> JoinHandle<()> {
+    std::thread::spawn(move || {
+        let delimiter_lists = &state.delimiter_lists;
+        let hashes = &state.hashes;
+        let words = &state.words;
+        let delimiters_len = state.delimiters_len;
+
+        let mut count = 0;
+        let mut found = 0;
+        let mut buf = Vec::with_capacity(1024);
+
+        // while let Some(indices) = find_task(local, global, &[]) {
+        while let Ok(indices) = rx.recv() {
+            let sequence = indices.iter().map(|i| words[*i].as_str());
+
+            // We only want delimiters between words, so we keep that iterator shorter by
+            // one.
+            let delimiter_count = sequence.len() as u32 - 1;
+
+            for prefix in PREFIXES.iter().map(|p| p.as_bytes()) {
+                buf.clear();
+
+                // We can keep the prefix at the front of the buffer and only
+                // replace the parts after that.
+                let prefix_len = prefix.len();
+                buf.extend_from_slice(prefix);
+
+                for delims in delimiter_lists
+                    .iter()
+                    .take(delimiters_len.pow(delimiter_count))
+                {
+                    buf.truncate(prefix_len);
+
+                    let delims = delims
+                        .iter()
+                        .map(|s| s.as_str())
+                        .take(delimiter_count as usize);
+                    sequence
+                        .clone()
+                        .interleave(delims.clone())
+                        .for_each(|word| buf.extend_from_slice(word.as_bytes()));
+
+                    count += 1;
+
+                    let hash = Murmur64::hash(&buf);
+                    if hashes.contains(&hash) {
+                        found += 1;
+
+                        buf.push(LINE_FEED);
+                        if let Err(_) = state.stdout_tx.send(buf.clone()) {
+                            return;
+                        }
+                    } else {
+                        let word_len = buf.len();
+
+                        // If the regular word itself didn't match, we check
+                        // for numbered suffixes.
+                        // For now, we only check up to `09` to avoid more complex logic
+                        // writing into the buffer.
+                        // Packages that contain files with higher numbers than this
+                        // should hopefully become easier to spot once a good number of
+                        // hashes is found.
+                        for i in 1..=9 {
+                            buf.truncate(word_len);
+                            buf.push(UNDERSCORE);
+                            buf.push(ZERO);
+                            buf.push(ZERO + i);
+
+                            count += 1;
+
+                            let hash = Murmur64::hash(&buf);
+                            if hashes.contains(&hash) {
+                                found += 1;
+
+                                buf.push(LINE_FEED);
+                                if let Err(_) = state.stdout_tx.send(buf.clone()) {
+                                    return;
+                                }
+                            } else {
+                                break;
+                            }
+                        }
+                    }
+                }
+            }
+
+            if count >= 1024 * 1024 {
+                let _ = state.info_tx.send((count, found));
+            }
+
+            // let dur = Instant::now() - start;
+            // if dur.as_secs() >= 1 {
+            //     let hashes_len = hashes.len();
+            //     let s = String::from_utf8_lossy(&buf);
+            //     // The last prefix in the set is the one that will stay in the buffer
+            //     // when we're about to print here.
+            //     // So we strip that, to show just the generated part.
+            //     // We also restrict the length to stay on a single line.
+            //     let prefix_len = prefixes[28].len();
+            //     let s = s[prefix_len..std::cmp::min(s.len(), prefix_len + 60)]
+            //         .trim_end()
+            //         .to_string();
+            //     info_tx.send(format!(
+            //         "\r{:8} hashes per second | {:6}/{} found | {:<60}",
+            //         count, found, hashes_len, s
+            //     ));
+
+            //     start = Instant::now();
+            //     count = 0;
+            // }
+        }
+    })
+}
+
+fn build_delimiter_lists(delimiters: impl AsRef<[String]>, max_length: usize) -> Vec<Vec<String>> {
+    let delimiters = delimiters.as_ref();
+    let mut indices = vec![0; max_length];
+    let mut list = Vec::new();
+
+    for _ in 0..delimiters.len().pow(max_length as u32) {
+        list.push(
+            indices
+                .iter()
+                .map(|i| delimiters[*i].clone())
+                .collect::<Vec<_>>(),
+        );
+
+        for v in indices.iter_mut() {
+            if *v >= delimiters.len() - 1 {
+                *v = 0;
+                break;
+            } else {
+                *v += 1;
+            }
+        }
+    }
+
+    list
+}
+
+fn build_initial_indices(
+    cont: Option<&String>,
+    delimiters: impl AsRef<[String]>,
+    words: impl AsRef<[String]>,
+) -> Result<Vec<usize>> {
+    if let Some(cont) = cont {
+        let mut splits = vec![cont.clone()];
+
+        for delim in delimiters.as_ref().iter() {
+            splits = splits
+                .iter()
+                .flat_map(|s| s.split(delim))
+                .map(|s| s.to_string())
+                .collect();
+        }
+
+        let indices = splits
+            .into_iter()
+            .map(|s| {
+                words
+                    .as_ref()
+                    .iter()
+                    .enumerate()
+                    .find(|(_, v)| s == **v)
+                    .map(|(i, _)| i)
+                    .ok_or_else(|| eyre::eyre!("'{}' is not in the word list", s))
+            })
+            .collect::<Result<Vec<_>, _>>()?;
+
+        tracing::info!("Continuing from '{}' -> '{:?}'", cont, &indices);
+
+        Ok(indices)
+    } else {
+        Ok(vec![0])
+    }
+}
+
 #[tracing::instrument(skip_all)]
 #[allow(clippy::mut_range_bound)]
-pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
+pub(crate) fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
     let max_length: usize = matches
         .get_one::<usize>("max-length")
         .copied()
         .expect("parameter has default");
 
-    let words: Vec<String> = {
+    let num_threads: usize = matches
+        .get_one::<usize>("threads")
+        .copied()
+        .expect("parameter has default");
+
+    let words = {
         let path = matches
             .get_one::<PathBuf>("words")
             .expect("missing required parameter");
 
-        let file = fs::read_to_string(&path)
-            .await
+        let file = fs::read_to_string(path)
             .wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
 
-        file.lines().map(str::to_string).collect()
-    };
+        let words: Vec<_> = file.lines().map(str::to_string).collect();
 
-    if words.is_empty() {
-        eyre::bail!("Word list must not be empty");
-    }
+        if words.is_empty() {
+            eyre::bail!("Word list must not be empty");
+        }
+
+        Arc::new(words)
+    };
 
     let hashes = {
         let path = matches
             .get_one::<PathBuf>("hashes")
             .expect("missing required argument");
-        let content = fs::read_to_string(&path)
-            .await
+        let content = fs::read_to_string(path)
             .wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
 
         let hashes: Result<HashSet<Murmur64>, _> = content
@@ -116,7 +398,7 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
 
         tracing::trace!("{:?}", hashes);
 
-        hashes
+        Arc::new(hashes)
     };
 
     let mut delimiters: Vec<String> = matches
@@ -132,38 +414,6 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
 
     let delimiters_len = delimiters.len();
 
-    let prefixes = [
-        "",
-        "content/characters/",
-        "content/debug/",
-        "content/decals/",
-        "content/environment/",
-        "content/fx/",
-        "content/gizmos/",
-        "content/items/",
-        "content/levels/",
-        "content/liquid_area/",
-        "content/localization/",
-        "content/materials/",
-        "content/minion_impact_assets/",
-        "content/pickups/",
-        "content/shading_environments/",
-        "content/textures/",
"content/ui/", - "content/videos/", - "content/vo/", - "content/volume_types/", - "content/weapons/", - "packages/boot_assets/", - "packages/content/", - "packages/game_scripts/", - "packages/strings/", - "packages/ui/", - "wwise/events/", - "wwise/packages/", - "wwise/world_sound_fx/", - ]; - let word_count = words.len(); tracing::info!("{} words to try", word_count); @@ -175,56 +425,43 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> // So we basically have to implement a smaller version of the iterative algorithm we use later on // to build permutations of the actual words. let delimiter_lists = { - let mut indices = vec![0; max_length - 1]; - let mut list = Vec::new(); - - for _ in 0..delimiters_len.pow(max_length as u32 - 1) { - list.push(indices.iter().map(|i| &delimiters[*i]).collect::>()); - - for v in indices.iter_mut() { - if *v >= delimiters_len - 1 { - *v = 0; - break; - } else { - *v += 1; - } - } - } - - list + let lists = build_delimiter_lists(&delimiters, max_length - 1); + Arc::new(lists) }; - tracing::debug!("{:?}", delimiter_lists); - let mut indices = if let Some(cont) = matches.get_one::("continue").cloned() { - let mut splits = vec![cont.clone()]; + let (info_tx, info_rx) = unbounded(); + let (stdout_tx, stdout_rx) = unbounded::>(); + let (task_tx, task_rx) = bounded::>(100); + let mut handles = Vec::new(); - for delim in delimiters.iter() { - splits = splits - .iter() - .flat_map(|s| s.split(delim)) - .map(|s| s.to_string()) - .collect(); - } + for _ in 0..num_threads { + let handle = make_worker( + task_rx.clone(), + State { + delimiter_lists: Arc::clone(&delimiter_lists), + hashes: Arc::clone(&hashes), + words: Arc::clone(&words), + delimiters_len, + stdout_tx: stdout_tx.clone(), + info_tx: info_tx.clone(), + }, + ); + handles.push(handle); + } + // These are only used inside the worker threads, but due to the loops above, we had to + // clone them one too many times. + // So we drop that extra reference immediately, to ensure that the channels can + // disconnect properly when the threads finish. + drop(stdout_tx); + drop(info_tx); - let indices = splits - .into_iter() - .map(|s| { - words - .iter() - .enumerate() - .find(|(_, v)| s == **v) - .map(|(i, _)| i) - .ok_or_else(|| eyre::eyre!("'{}' is not in the word list", s)) - }) - .collect::>()?; + // handles.push(make_info_printer(info_rx, hashes.len())); + handles.push(make_stdout_printer(stdout_rx)); - tracing::info!("Continuing from '{}' -> '{:?}'", cont, &indices); - - indices - } else { - vec![0] - }; + let mut indices = + build_initial_indices(matches.get_one::("continue"), &delimiters, &*words) + .wrap_err("Failed to build initial indices")?; let mut indices_len = indices.len(); let mut sequence = indices .iter() @@ -235,113 +472,8 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> indices.reserve(max_length); sequence.reserve(max_length); - let mut count: usize = 0; - let mut found: usize = 0; - let mut start = Instant::now(); - - // let mut writer = BufWriter::new(tokio::io::stdout()); - let mut writer = tokio::io::stdout(); - let mut buf = Vec::with_capacity(1024); - - const LINE_FEED: u8 = 0x0A; - const UNDERSCORE: u8 = 0x5F; - const ZERO: u8 = 0x30; - 'outer: loop { - // We only want delimiters between words, so we keep that iterator shorter by - // one. 
-        let delimiter_count = sequence.len() as u32 - 1;
-
-        for prefix in prefixes.iter().map(|p| p.as_bytes()) {
-            buf.clear();
-
-            // We can keep the prefix at the front of the buffer and only
-            // replace the parts after that.
-            let prefix_len = prefix.len();
-            buf.extend_from_slice(prefix);
-
-            for delims in delimiter_lists
-                .iter()
-                .take(delimiters_len.pow(delimiter_count))
-            {
-                buf.truncate(prefix_len);
-
-                let delims = delims
-                    .iter()
-                    .map(|s| s.as_str())
-                    .take(delimiter_count as usize);
-                sequence
-                    .iter()
-                    .copied()
-                    .interleave(delims.clone())
-                    .for_each(|word| buf.extend_from_slice(word.as_bytes()));
-
-                count += 1;
-
-                let hash = Murmur64::hash(&buf);
-                if hashes.contains(&hash) {
-                    found += 1;
-                    buf.push(LINE_FEED);
-                    writer.write_all(&buf).await?;
-                } else {
-                    let word_len = buf.len();
-
-                    // If the regular word itself didn't match, we check
-                    // for numbered suffixes.
-                    // For now, we only check up to `09` to avoid more complex logic
-                    // writing into the buffer.
-                    // Packages that contain files with higher numbers than this
-                    // should hopefully become easier to spot once a good number of
-                    // hashes is found.
-                    for i in 1..=9 {
-                        buf.truncate(word_len);
-                        buf.push(UNDERSCORE);
-                        buf.push(ZERO);
-                        buf.push(ZERO + i);
-
-                        count += 1;
-
-                        let hash = Murmur64::hash(&buf);
-                        if hashes.contains(&hash) {
-                            found += 1;
-                            buf.push(LINE_FEED);
-                            writer.write_all(&buf).await?;
-                        } else {
-                            break;
-                        }
-                    }
-                }
-            }
-        }
-
-        let dur = Instant::now() - start;
-        if dur.as_secs() >= 1 {
-            let hashes_len = hashes.len();
-            let s = String::from_utf8_lossy(&buf);
-            // The last prefix in the set is the one that will stay in the buffer
-            // when we're about to print here.
-            // So we strip that, to show just the generated part.
-            // We also restrict the length to stay on a single line.
-            let prefix_len = prefixes[28].len();
-            let s = s[prefix_len..std::cmp::min(s.len(), prefix_len + 60)]
-                .trim_end()
-                .to_string();
-            // Don't care when it finishes, don't care if it fails.
-            tokio::spawn(async move {
-                let _ = tokio::io::stderr()
-                    .write_all(
-                        format!(
-                            "\r{:8} hashes per second | {:6}/{} found | {:<60}",
-                            count, found, hashes_len, s
-                        )
-                        .as_bytes(),
-                    )
-                    .await;
-            });
-
-            start = Instant::now();
-            count = 0;
-        }
+        task_tx.send(indices.clone())?;
 
         for i in 0..indices_len {
             let index = indices.get_mut(i).unwrap();
@@ -371,5 +503,25 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
         }
     }
 
+    // Dropping the senders will disconnect the channel,
+    // so that the threads holding the other end will eventually
+    // complete as well.
+    drop(task_tx);
+
+    tracing::debug!("Waiting for workers to finish.");
+
+    for handle in handles {
+        match handle.join() {
+            Ok(_) => {}
+            Err(value) => {
+                if let Some(err) = value.downcast_ref::<String>() {
+                    eyre::bail!("Thread failed: {}", err);
+                } else {
+                    eyre::bail!("Thread failed with unknown error: {:?}", value);
+                }
+            }
+        }
+    }
+
     Ok(())
 }
diff --git a/crates/dtmt/src/cmd/experiment/mod.rs b/crates/dtmt/src/cmd/experiment/mod.rs
index 9ceb3b9..c53d9b5 100644
--- a/crates/dtmt/src/cmd/experiment/mod.rs
+++ b/crates/dtmt/src/cmd/experiment/mod.rs
@@ -15,7 +15,9 @@ pub(crate) fn command_definition() -> Command {
 #[tracing::instrument(skip_all)]
 pub(crate) async fn run(ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
     match matches.subcommand() {
-        Some(("brute-force-words", sub_matches)) => brute_force_words::run(ctx, sub_matches).await,
+        // It's fine to block here, as this is the only thing that's executing on the runtime.
+ // The other option with `spawn_blocking` would require setting up values to be Send+Sync. + Some(("brute-force-words", sub_matches)) => brute_force_words::run(ctx, sub_matches), Some(("extract-words", sub_matches)) => extract_words::run(ctx, sub_matches).await, _ => unreachable!( "clap is configured to require a subcommand, and they're all handled above" From 64493547143f43b90e405a9ef98fe31066de93a9 Mon Sep 17 00:00:00 2001 From: Lucas Schwiderski Date: Tue, 19 Sep 2023 16:15:22 +0200 Subject: [PATCH 08/10] sdk: Reimplement logging current word --- .../src/cmd/experiment/brute_force_words.rs | 86 ++++++++----------- 1 file changed, 36 insertions(+), 50 deletions(-) diff --git a/crates/dtmt/src/cmd/experiment/brute_force_words.rs b/crates/dtmt/src/cmd/experiment/brute_force_words.rs index aa15003..4bf8556 100644 --- a/crates/dtmt/src/cmd/experiment/brute_force_words.rs +++ b/crates/dtmt/src/cmd/experiment/brute_force_words.rs @@ -1,6 +1,6 @@ use std::collections::HashSet; use std::fs; -use std::io::Write; +use std::io::{BufWriter, Write}; use std::path::PathBuf; use std::sync::Arc; use std::thread::JoinHandle; @@ -122,35 +122,30 @@ const PREFIXES: [&str; 29] = [ "wwise/world_sound_fx/", ]; -fn make_info_printer(rx: Receiver<(usize, usize)>, hash_count: usize) -> JoinHandle<()> { +fn make_info_printer(rx: Receiver<(usize, usize, String)>, hash_count: usize) -> JoinHandle<()> { std::thread::spawn(move || { let mut writer = std::io::stderr(); let mut total_count = 0; let mut total_found = 0; - let start = Instant::now(); + let mut start = Instant::now(); - while let Ok((count, found)) = rx.recv() { + while let Ok((count, found, last)) = rx.recv() { total_count += count; total_found += found; - let dur = Instant::now() - start; - if dur.as_secs() > 1 { - let s = format!("\r{total_count} per second | {total_found:6}/{hash_count} found",); - - // let s = String::from_utf8_lossy(&buf); - // // The last prefix in the set is the one that will stay in the buffer - // // when we're about to print here. - // // So we strip that, to show just the generated part. - // // We also restrict the length to stay on a single line. 
-            //     let prefix_len = prefixes[28].len();
-            //     let s = s[prefix_len..std::cmp::min(s.len(), prefix_len + 60)]
-            //         .trim_end()
-            //         .to_string();
+            let now = Instant::now();
+            if (now - start).as_millis() > 250 {
+                let s = &last[0..std::cmp::min(last.len(), 60)];
+                let s = format!(
+                    "\r{:12} per second | {total_found:6}/{hash_count} found | {s:<60}",
+                    total_count * 4
+                );
 
                 writer.write_all(s.as_bytes()).unwrap();
 
                 total_count = 0;
+                start = now;
             }
         }
     })
 }
@@ -158,7 +153,7 @@ fn make_info_printer(rx: Receiver<(usize, usize)>, hash_count: usize) -> JoinHan
 
 fn make_stdout_printer(rx: Receiver<Vec<u8>>) -> JoinHandle<()> {
     std::thread::spawn(move || {
-        let mut writer = std::io::stdout();
+        let mut writer = BufWriter::new(std::io::stdout());
 
         while let Ok(buf) = rx.recv() {
             writer.write_all(&buf).unwrap();
@@ -172,7 +167,7 @@ struct State {
     words: Arc<Vec<String>>,
     delimiters_len: usize,
     stdout_tx: Sender<Vec<u8>>,
-    info_tx: Sender<(usize, usize)>,
+    info_tx: Sender<(usize, usize, String)>,
 }
 
 fn make_worker(rx: Receiver<Vec<usize>>, state: State) -> JoinHandle<()> {
@@ -186,7 +181,6 @@ fn make_worker(rx: Receiver<Vec<usize>>, state: State) -> JoinHandle<()> {
     let mut found = 0;
     let mut buf = Vec::with_capacity(1024);
 
-    // while let Some(indices) = find_task(local, global, &[]) {
     while let Ok(indices) = rx.recv() {
         let sequence = indices.iter().map(|i| words[*i].as_str());
 
@@ -224,7 +218,7 @@ fn make_worker(rx: Receiver<Vec<usize>>, state: State) -> JoinHandle<()> {
                         found += 1;
 
                         buf.push(LINE_FEED);
-                        if let Err(_) = state.stdout_tx.send(buf.clone()) {
+                        if state.stdout_tx.send(buf.clone()).is_err() {
                             return;
                         }
                     } else {
@@ -250,7 +244,7 @@ fn make_worker(rx: Receiver<Vec<usize>>, state: State) -> JoinHandle<()> {
                                 found += 1;
 
                                 buf.push(LINE_FEED);
-                                if let Err(_) = state.stdout_tx.send(buf.clone()) {
+                                if state.stdout_tx.send(buf.clone()).is_err() {
                                     return;
                                 }
                             } else {
@@ -261,30 +255,22 @@ fn make_worker(rx: Receiver<Vec<usize>>, state: State) -> JoinHandle<()> {
             }
         }
 
-        if count >= 1024 * 1024 {
-            let _ = state.info_tx.send((count, found));
+        if count >= 2 * 1024 * 1024 {
+            // The last prefix in the set is the one that will stay in the buffer
+            // when we're about to print here.
+            // So we strip that, to show just the generated part.
+            // We also restrict the length to stay on a single line.
+            let prefix_len = PREFIXES[28].len();
+            // No need to wait for this
+            let _ = state.info_tx.try_send((
+                count,
+                found,
+                String::from_utf8_lossy(&buf[prefix_len..]).to_string(),
+            ));
+
+            count = 0;
+            found = 0;
         }
-
-        // let dur = Instant::now() - start;
-        // if dur.as_secs() >= 1 {
-        //     let hashes_len = hashes.len();
-        //     let s = String::from_utf8_lossy(&buf);
-        //     // The last prefix in the set is the one that will stay in the buffer
-        //     // when we're about to print here.
-        //     // So we strip that, to show just the generated part.
-        //     // We also restrict the length to stay on a single line.
From 6ada4c1c43298c11593422b0c0aa2330738ae5fe Mon Sep 17 00:00:00 2001
From: Lucas Schwiderski
Date: Wed, 20 Sep 2023 11:33:24 +0200
Subject: [PATCH 09/10] sdk: Add additional brute force prefixes

---
 crates/dtmt/src/cmd/experiment/brute_force_words.rs | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/crates/dtmt/src/cmd/experiment/brute_force_words.rs b/crates/dtmt/src/cmd/experiment/brute_force_words.rs
index 4bf8556..26d887f 100644
--- a/crates/dtmt/src/cmd/experiment/brute_force_words.rs
+++ b/crates/dtmt/src/cmd/experiment/brute_force_words.rs
@@ -90,13 +90,14 @@ const LINE_FEED: u8 = 0x0A;
 const UNDERSCORE: u8 = 0x5F;
 const ZERO: u8 = 0x30;
 
-const PREFIXES: [&str; 29] = [
+const PREFIXES: [&str; 36] = [
     "",
     "content/characters/",
     "content/debug/",
     "content/decals/",
     "content/environment/",
     "content/fx/",
+    "content/fx/particles/",
     "content/gizmos/",
     "content/items/",
     "content/levels/",
@@ -112,14 +113,20 @@ const PREFIXES: [&str; 29] = [
     "content/vo/",
     "content/volume_types/",
     "content/weapons/",
+    "content/",
+    "core/",
+    "core/units/",
     "packages/boot_assets/",
     "packages/content/",
     "packages/game_scripts/",
     "packages/strings/",
     "packages/ui/",
+    "packages/",
     "wwise/events/",
     "wwise/packages/",
     "wwise/world_sound_fx/",
+    "wwise/events/weapons/",
+    "wwise/events/minions/",
 ];
 
 fn make_info_printer(rx: Receiver<(usize, usize, String)>, hash_count: usize) -> JoinHandle<()> {
@@ -153,7 +160,7 @@ fn make_info_printer(rx: Receiver<(usize, usize, String)>, hash_count: usize) ->
 
 fn make_stdout_printer(rx: Receiver<Vec<u8>>) -> JoinHandle<()> {
     std::thread::spawn(move || {
-        let mut writer = BufWriter::new(std::io::stdout());
+        let mut writer = std::io::stdout();
 
         while let Ok(buf) = rx.recv() {
             writer.write_all(&buf).unwrap();
@@ -260,7 +267,7 @@ fn make_worker(rx: Receiver<Vec<usize>>, state: State) -> JoinHandle<()> {
             // when we're about to print here.
             // So we strip that, to show just the generated part.
             // We also restrict the length to stay on a single line.
-            let prefix_len = PREFIXES[28].len();
+            let prefix_len = PREFIXES[35].len();
             // No need to wait for this
             let _ = state.info_tx.try_send((
                 count,
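Growing PREFIXES from 29 to 36 entries also forces the hand-maintained index in the progress sample to move (`PREFIXES[28]` to `PREFIXES[35]`), since the prefix stripped from the display buffer is simply the last entry of the array. A sketch of an alternative that tracks the array automatically (not part of the series, and assuming the intent really is "always the final entry"):

    // Length of the last prefix without a hard-coded index; falls back to 0
    // for an empty array.
    let prefix_len = PREFIXES.last().map(|p| p.len()).unwrap_or(0);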
From ae1e7e5aa6e8c0ca54da5a485cad9a2e0e64b869 Mon Sep 17 00:00:00 2001
From: Lucas Schwiderski
Date: Fri, 22 Sep 2023 11:46:57 +0200
Subject: [PATCH 10/10] dtmt: Add word extraction algorithm for paths

---
 .../dtmt/src/cmd/experiment/extract_words.rs  | 311 +++++++++++++++++-
 1 file changed, 296 insertions(+), 15 deletions(-)

diff --git a/crates/dtmt/src/cmd/experiment/extract_words.rs b/crates/dtmt/src/cmd/experiment/extract_words.rs
index 512038d..1a8cda5 100644
--- a/crates/dtmt/src/cmd/experiment/extract_words.rs
+++ b/crates/dtmt/src/cmd/experiment/extract_words.rs
@@ -1,4 +1,4 @@
-use std::collections::HashSet;
+use std::collections::HashMap;
 use std::path::PathBuf;
 
 use clap::{value_parser, Arg, ArgMatches, Command, ValueEnum};
@@ -36,7 +36,7 @@ pub(crate) fn command_definition() -> Command {
         )
 }
 
-#[derive(Copy, Clone, Debug, ValueEnum)]
+#[derive(Copy, Clone, Debug, Eq, PartialEq, ValueEnum)]
 #[value(rename_all = "snake_case")]
 enum Algorithm {
     Alphabetic,
@@ -45,6 +45,7 @@ enum Algorithm {
     Number,
     Hash32,
     Hash64,
+    Paths,
 }
 
 impl Algorithm {
@@ -55,6 +56,8 @@ impl Algorithm {
             Self::Identifier => c.is_ascii_alphabetic(),
             Self::Number => c.is_numeric(),
             Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
+            // Supposed to be handled separately
+            Self::Paths => false,
         }
     }
 
@@ -65,6 +68,8 @@ impl Algorithm {
             Self::Identifier => c.is_ascii_alphanumeric(),
             Self::Number => c.is_numeric(),
             Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
+            // Supposed to be handled separately
+            Self::Paths => false,
         }
     }
 
@@ -76,6 +81,8 @@ impl Algorithm {
             Self::Number => true,
             Self::Hash32 => len == 8,
             Self::Hash64 => len == 16,
+            // Supposed to be handled separately
+            Self::Paths => false,
         }
     }
 }
@@ -92,11 +99,274 @@ impl std::fmt::Display for Algorithm {
                 Algorithm::Number => "number",
                 Algorithm::Hash32 => "hash32",
                 Algorithm::Hash64 => "hash64",
+                Algorithm::Paths => "paths",
             }
         )
     }
 }
 
+#[derive(Copy, Clone, Debug)]
+enum PathState {
+    Begin,
+    PathComponent,
+    PathSeparator,
+    Boundary,
+    NonWord,
+    End,
+}
+
+#[tracing::instrument(skip(chars))]
+fn extract_paths(chars: impl Iterator<Item = char>) -> Vec<Vec<String>> {
+    let mut chars = chars.peekable();
+
+    let mut state = PathState::Begin;
+    let mut list = Vec::new();
+    let mut path = Vec::new();
+    let mut word = String::new();
+
+    let is_boundary = |c: char| c == '\n' || c == ' ' || c == ',' || c == '\t' || c == '|';
+
+    'machine: loop {
+        state = match state {
+            PathState::Begin => match chars.next() {
+                None => PathState::End,
+                Some(c) if c.is_ascii_alphabetic() => {
+                    word.push(c);
+                    PathState::PathComponent
+                }
+                Some(c) if is_boundary(c) => PathState::Boundary,
+                Some('/') => PathState::PathSeparator,
+                Some(_) => PathState::NonWord,
+            },
+            PathState::PathComponent => match chars.next() {
+                None => {
+                    path.push(word.clone());
+                    list.push(path.clone());
+
+                    PathState::End
+                }
+                Some(c) if c.is_ascii_alphanumeric() || c == '_' => {
+                    word.push(c);
+                    PathState::PathComponent
+                }
+                Some('/') => {
+                    path.push(word.clone());
+                    word.clear();
+
+                    PathState::PathSeparator
+                }
+                Some(c) if is_boundary(c) => {
+                    path.push(word.clone());
+                    list.push(path.clone());
+
+                    path.clear();
+                    word.clear();
+
+                    PathState::Boundary
+                }
+                Some(_) => {
+                    list.push(path.clone());
+
+                    path.clear();
+                    word.clear();
+
+                    PathState::NonWord
+                }
+            },
+            PathState::PathSeparator => match chars.next() {
+                None => {
+                    list.push(path.clone());
+                    PathState::End
+                }
+                Some('/') => PathState::PathSeparator,
+                Some(c) if c.is_ascii_alphabetic() || c == '_' => {
+                    word.push(c);
+                    PathState::PathComponent
+                }
+                Some(c) if is_boundary(c) => {
+                    list.push(path.clone());
+                    path.clear();
+                    PathState::Boundary
+                }
+                Some(_) => {
+                    list.push(path.clone());
+                    path.clear();
+                    PathState::NonWord
+                }
+            },
+            PathState::Boundary => match chars.next() {
+                None => PathState::End,
+                Some(c) if c.is_ascii_alphabetic() => {
+                    word.push(c);
+                    PathState::PathComponent
+                }
+                Some(c) if is_boundary(c) => PathState::Boundary,
+                Some(_) => PathState::NonWord,
+            },
+            PathState::NonWord => match chars.next() {
+                None => PathState::End,
+                Some(c) if is_boundary(c) => PathState::Boundary,
+                Some(_) => PathState::NonWord,
+            },
+            PathState::End => {
+                break 'machine;
+            }
+        }
+    }
+
+    list
+}
+
+#[tracing::instrument(skip(chars))]
+fn algorithm_path_components(chars: impl Iterator<Item = char>, min_length: usize) {
+    let mut chars = chars.peekable();
+
+    let mut state = PathState::Begin;
+    let mut word = String::new();
+    let mut lists = vec![HashMap::<String, usize>::new()];
+    let mut index = 0;
+
+    let is_boundary = |c: char| c == '\n' || c == ' ' || c == ',' || c == '\t';
+
+    'machine: loop {
+        state = match state {
+            PathState::Begin => match chars.next() {
+                None => PathState::End,
+                Some(c) if c.is_ascii_alphabetic() => {
+                    word.push(c);
+                    PathState::PathComponent
+                }
+                Some(c) if is_boundary(c) => PathState::Boundary,
+                // Ignore leading path separators to not trigger the logic of advancing
+                // the component count
+                Some('/') => PathState::Boundary,
+                Some(_) => PathState::NonWord,
+            },
+            PathState::PathComponent => match chars.next() {
+                None => PathState::End,
+                Some(c) if c.is_ascii_alphanumeric() || c == '_' => {
+                    word.push(c);
+                    PathState::PathComponent
+                }
+                Some('/') => PathState::PathSeparator,
+                Some(c) => {
+                    if index > 0 && word.len() >= min_length {
+                        let list = &mut lists[index];
+                        list.entry(word.clone())
+                            .and_modify(|count| *count += 1)
+                            .or_insert(1);
+                    }
+                    word.clear();
+
+                    index = 0;
+
+                    if is_boundary(c) {
+                        PathState::Boundary
+                    } else {
+                        PathState::NonWord
+                    }
+                }
+            },
+            PathState::PathSeparator => {
+                if word.len() >= min_length {
+                    let list = &mut lists[index];
+                    list.entry(word.clone())
+                        .and_modify(|count| *count += 1)
+                        .or_insert(1);
+                }
+                word.clear();
+
+                index += 1;
+                if lists.get(index).is_none() {
+                    lists.push(HashMap::new());
+                }
+
+                // Ignore multiple separators
+                while chars.next_if(|c| *c == '/').is_some() {}
+
+                match chars.next() {
+                    None => PathState::End,
+                    Some(c) if c.is_ascii_alphabetic() || c == '_' => {
+                        word.push(c);
+                        PathState::PathComponent
+                    }
+                    Some(c) if is_boundary(c) => {
+                        index = 0;
+                        PathState::Boundary
+                    }
+                    Some(_) => {
+                        index = 0;
+                        PathState::NonWord
+                    }
+                }
+            }
+            PathState::Boundary => match chars.next() {
+                None => PathState::End,
+                Some(c) if c.is_ascii_alphabetic() => {
+                    word.push(c);
+                    PathState::PathComponent
+                }
+                Some(c) if is_boundary(c) => PathState::Boundary,
+                Some(_) => PathState::NonWord,
+            },
+            PathState::NonWord => match chars.next() {
+                None => PathState::End,
+                Some(c) if is_boundary(c) => PathState::Boundary,
+                Some(_) => PathState::NonWord,
+            },
+            PathState::End => {
+                if word.len() >= min_length {
+                    let list = &mut lists[index];
+                    list.entry(word.clone())
+                        .and_modify(|count| *count += 1)
+                        .or_insert(1);
+                }
+
+                break 'machine;
+            }
+        }
+    }
+
+    for i in 0..lists.len() {
+        print!("Word {i}, Count {i},");
+    }
+    println!();
+
+    let mut lines: Vec<Vec<Option<(String, usize)>>> = Vec::new();
+
+    for (i, list) in lists.into_iter().enumerate() {
+        let mut entries = list.into_iter().collect::<Vec<_>>();
+        entries.sort_by(|(_, a), (_, b)| b.partial_cmp(a).unwrap());
+
+        for (j, (word, count)) in entries.into_iter().enumerate() {
+            if let Some(line) = lines.get_mut(j) {
+                while line.len() < i {
+                    line.push(None);
+                }
+                line.push(Some((word, count)));
+            } else {
+                let mut line = Vec::new();
+                while line.len() < i {
+                    line.push(None);
+                }
+                line.push(Some((word, count)));
+                lines.push(line);
+            }
+        }
+    }
+
+    for line in lines.iter() {
+        for cell in line.iter() {
+            if let Some((word, count)) = cell {
+                print!("{},{},", word, count);
+            } else {
+                print!(",,");
+            }
+        }
+        println!();
+    }
+}
+
 #[derive(Copy, Clone, Debug)]
 enum State {
     Begin,
@@ -125,9 +395,14 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
         .wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
     let mut chars = content.chars();
 
+    if *algorithm == Algorithm::Paths {
+        algorithm_path_components(chars, min_length);
+        return Ok(());
+    }
+
     let mut state = State::Begin;
     let mut word = String::new();
-    let mut visited = HashSet::new();
+    let mut visited = HashMap::new();
 
     'machine: loop {
         state = match state {
@@ -150,12 +425,11 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
             },
             State::Word => match chars.next() {
                 None => {
-                    if word.len() >= min_length
-                        && algorithm.is_length(word.len())
-                        && !visited.contains(&word)
-                    {
-                        println!("{}", &word);
-                        visited.insert(word.clone());
+                    if word.len() >= min_length && algorithm.is_length(word.len()) {
+                        visited
+                            .entry(word.clone())
+                            .and_modify(|v| *v += 1)
+                            .or_insert(1);
                     }
                     State::End
                 }
@@ -164,12 +438,11 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
                     State::Word
                 }
                 Some(_) => {
-                    if word.len() >= min_length
-                        && algorithm.is_length(word.len())
-                        && !visited.contains(&word)
-                    {
-                        println!("{}", &word);
-                        visited.insert(word.clone());
+                    if word.len() >= min_length && algorithm.is_length(word.len()) {
+                        visited
+                            .entry(word.clone())
+                            .and_modify(|v| *v += 1)
+                            .or_insert(1);
                     }
                     word.clear();
                     State::NonWord
@@ -178,5 +451,13 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
         }
     }
 
+    let mut entries: Vec<(String, usize)> = visited.into_iter().collect();
+    // Reverse sides during comparison to get "highest to lowest"
+    entries.sort_by(|(_, a), (_, b)| b.partial_cmp(a).unwrap());
+
+    entries
+        .iter()
+        .for_each(|(word, count)| println!("{:016} {}", word, count));
+
     Ok(())
 }
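Taken together, the `paths` algorithm tokenizes the input with the PathState machine, tallies each path component into a per-depth frequency table (`lists[depth]`), and prints the tables side by side as CSV columns sorted by count. A condensed sketch of that counting idea, using plain string splitting in place of the state machine; the tokenization below (split on whitespace, then on '/') is a simplification of the patch's boundary rules, and `count_by_depth` is an illustrative name, not from the series:

    use std::collections::HashMap;

    fn count_by_depth(input: &str, min_length: usize) -> Vec<HashMap<String, usize>> {
        let mut lists: Vec<HashMap<String, usize>> = Vec::new();
        for path in input.split_whitespace() {
            for (depth, component) in path.split('/').filter(|c| !c.is_empty()).enumerate() {
                // Mirror the identifier rules: minimum length, ASCII
                // alphanumerics and underscores only.
                let valid = component.len() >= min_length
                    && component.chars().all(|c| c.is_ascii_alphanumeric() || c == '_');
                if !valid {
                    continue;
                }
                if lists.len() <= depth {
                    lists.resize_with(depth + 1, HashMap::new);
                }
                *lists[depth].entry(component.to_string()).or_insert(0) += 1;
            }
        }
        lists
    }

    fn main() {
        let input = "content/fx/impact content/fx/ricochet wwise/events/weapons";
        for (depth, list) in count_by_depth(input, 3).iter().enumerate() {
            // Depth 0: content=2, wwise=1; depth 1: fx=2, events=1; etc.
            println!("depth {depth}: {list:?}");
        }
    }

Assuming the subcommand wiring from earlier in the series, an invocation might look like `dtmt experiment extract-words strings.txt --algorithm paths --min-length 3 > components.csv`. One small quirk in the non-paths output path: Rust's `0` fill flag also applies to strings, so `{:016}` zero-pads the word column; `{:<16}` may have been the intended alignment.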