From 2daff544a55c3c4f95f50b6cb715fc4bcb73d5c1 Mon Sep 17 00:00:00 2001
From: Lucas Schwiderski
Date: Wed, 17 Jul 2024 09:18:56 +0200
Subject: [PATCH 01/10] Add subcommand for experimental operations

These may be temporary ones that help while analyzing and developing
file formats, or long-term experiments.
---
 crates/dtmt/src/cmd/experiment/mod.rs | 17 +++++++++++++++++
 crates/dtmt/src/main.rs               |  3 +++
 2 files changed, 20 insertions(+)
 create mode 100644 crates/dtmt/src/cmd/experiment/mod.rs

diff --git a/crates/dtmt/src/cmd/experiment/mod.rs b/crates/dtmt/src/cmd/experiment/mod.rs
new file mode 100644
index 0000000..b29f83a
--- /dev/null
+++ b/crates/dtmt/src/cmd/experiment/mod.rs
@@ -0,0 +1,17 @@
+use clap::{ArgMatches, Command};
+use color_eyre::Result;
+
+pub(crate) fn command_definition() -> Command {
+    Command::new("experiment")
+        .subcommand_required(true)
+        .about("A collection of utilities and experiments.")
+}
+
+#[tracing::instrument(skip_all)]
+pub(crate) async fn run(ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
+    match matches.subcommand() {
+        _ => unreachable!(
+            "clap is configured to require a subcommand, and they're all handled above"
+        ),
+    }
+}
diff --git a/crates/dtmt/src/main.rs b/crates/dtmt/src/main.rs
index 2e10b17..b01956a 100644
--- a/crates/dtmt/src/main.rs
+++ b/crates/dtmt/src/main.rs
@@ -21,6 +21,7 @@ mod cmd {
     pub mod build;
     pub mod bundle;
     pub mod dictionary;
+    pub mod experiment;
     pub mod migrate;
     pub mod murmur;
     pub mod new;
@@ -56,6 +57,7 @@ async fn main() -> Result<()> {
         .subcommand(cmd::build::command_definition())
         .subcommand(cmd::bundle::command_definition())
         .subcommand(cmd::dictionary::command_definition())
+        .subcommand(cmd::experiment::command_definition())
         .subcommand(cmd::migrate::command_definition())
         .subcommand(cmd::murmur::command_definition())
         .subcommand(cmd::new::command_definition())
@@ -133,6 +135,7 @@ async fn main() -> Result<()> {
         Some(("build", sub_matches)) => cmd::build::run(ctx, sub_matches).await?,
         Some(("bundle", sub_matches)) => cmd::bundle::run(ctx, sub_matches).await?,
         Some(("dictionary", sub_matches)) => cmd::dictionary::run(ctx, sub_matches).await?,
+        Some(("experiment", sub_matches)) => cmd::experiment::run(ctx, sub_matches).await?,
         Some(("migrate", sub_matches)) => cmd::migrate::run(ctx, sub_matches).await?,
         Some(("murmur", sub_matches)) => cmd::murmur::run(ctx, sub_matches).await?,
         Some(("new", sub_matches)) => cmd::new::run(ctx, sub_matches).await?,

From 94347d57f9790ff61188a64192e98b602d891afb Mon Sep 17 00:00:00 2001
From: Lucas Schwiderski
Date: Sat, 16 Sep 2023 18:43:52 +0200
Subject: [PATCH 02/10] dtmt: Add command to extract words from file

As part of trying to brute force values for the dictionary, this
allows extracting candidate words from a file.
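For reference, a rough standalone sketch of the scanner idea this patch
implements (not the patch's code: a simplified version that hard-codes the
"identifier" rules; the `extract_identifiers` name and collecting results
into a Vec instead of streaming to stdout are illustrative assumptions):

    use std::collections::HashSet;

    /// Simplified scanner: collects unique words that start with an ASCII
    /// letter and continue with ASCII alphanumerics, dropping anything
    /// shorter than `min_length`.
    fn extract_identifiers(content: &str, min_length: usize) -> Vec<String> {
        let mut seen = HashSet::new();
        let mut out = Vec::new();
        let mut word = String::new();

        // A '\0' sentinel guarantees the final word is flushed.
        for c in content.chars().chain(std::iter::once('\0')) {
            let is_word_char = if word.is_empty() {
                c.is_ascii_alphabetic() // start rule
            } else {
                c.is_ascii_alphanumeric() // body rule
            };

            if is_word_char {
                word.push(c);
            } else if !word.is_empty() {
                if word.len() >= min_length && seen.insert(word.clone()) {
                    out.push(word.clone());
                }
                word.clear();
            }
        }

        out
    }

    fn main() {
        for w in extract_identifiers("lights_03 = world/units/lights_03", 3) {
            println!("{w}"); // prints: lights, world, units
        }
    }

The actual command below generalizes the two predicates per algorithm and
adds a length check for the fixed-width hash variants.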
---
 .../dtmt/src/cmd/experiment/extract_words.rs | 182 ++++++++++++++++++
 crates/dtmt/src/cmd/experiment/mod.rs        |   4 +
 2 files changed, 186 insertions(+)
 create mode 100644 crates/dtmt/src/cmd/experiment/extract_words.rs

diff --git a/crates/dtmt/src/cmd/experiment/extract_words.rs b/crates/dtmt/src/cmd/experiment/extract_words.rs
new file mode 100644
index 0000000..512038d
--- /dev/null
+++ b/crates/dtmt/src/cmd/experiment/extract_words.rs
@@ -0,0 +1,182 @@
+use std::collections::HashSet;
+use std::path::PathBuf;
+
+use clap::{value_parser, Arg, ArgMatches, Command, ValueEnum};
+use color_eyre::eyre::Context;
+use color_eyre::Result;
+use tokio::fs;
+
+pub(crate) fn command_definition() -> Command {
+    Command::new("extract-words")
+        .about(
+            "Extract unique alphanumeric sequences that match common identifier rules from the given file. \
+             Only ASCII is supported.",
+        )
+        .arg(
+            Arg::new("file")
+                .required(true)
+                .value_parser(value_parser!(PathBuf))
+                .help("Path to the file to extract words from."),
+        )
+        .arg(
+            Arg::new("min-length")
+                .help("Minimum length to consider a word.")
+                .long("min-length")
+                .short('m')
+                .default_value("3")
+                .value_parser(value_parser!(usize))
+        )
+        .arg(
+            Arg::new("algorithm")
+                .help("The algorithm to determine matching words")
+                .long("algorithm")
+                .short('a')
+                .default_value("identifier")
+                .value_parser(value_parser!(Algorithm))
+        )
+}
+
+#[derive(Copy, Clone, Debug, ValueEnum)]
+#[value(rename_all = "snake_case")]
+enum Algorithm {
+    Alphabetic,
+    Alphanumeric,
+    Identifier,
+    Number,
+    Hash32,
+    Hash64,
+}
+
+impl Algorithm {
+    fn is_start(&self, c: char) -> bool {
+        match self {
+            Self::Alphabetic => c.is_ascii_alphabetic(),
+            Self::Alphanumeric => c.is_ascii_alphanumeric(),
+            Self::Identifier => c.is_ascii_alphabetic(),
+            Self::Number => c.is_numeric(),
+            Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
+        }
+    }
+
+    fn is_body(&self, c: char) -> bool {
+        match self {
+            Self::Alphabetic => c.is_ascii_alphabetic(),
+            Self::Alphanumeric => c.is_ascii_alphanumeric(),
+            Self::Identifier => c.is_ascii_alphanumeric(),
+            Self::Number => c.is_numeric(),
+            Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
+        }
+    }
+
+    fn is_length(&self, len: usize) -> bool {
+        match self {
+            Self::Alphabetic => true,
+            Self::Alphanumeric => true,
+            Self::Identifier => true,
+            Self::Number => true,
+            Self::Hash32 => len == 8,
+            Self::Hash64 => len == 16,
+        }
+    }
+}
+
+impl std::fmt::Display for Algorithm {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "{}",
+            match self {
+                Algorithm::Alphabetic => "alphabetic",
+                Algorithm::Alphanumeric => "alphanumeric",
+                Algorithm::Identifier => "identifier",
+                Algorithm::Number => "number",
+                Algorithm::Hash32 => "hash32",
+                Algorithm::Hash64 => "hash64",
+            }
+        )
+    }
+}
+
+#[derive(Copy, Clone, Debug)]
+enum State {
+    Begin,
+    NonWord,
+    Word,
+    End,
+}
+
+#[tracing::instrument(skip_all)]
+pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
+    let path = matches
+        .get_one::<PathBuf>("file")
+        .expect("missing required parameter");
+
+    let algorithm = matches
+        .get_one::<Algorithm>("algorithm")
+        .expect("parameter has default");
+
+    let min_length = matches
+        .get_one::<usize>("min-length")
+        .copied()
+        .expect("parameter has default");
+
+    let content = fs::read_to_string(&path)
+        .await
+        .wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
+    let mut chars = content.chars();
+
+    let mut state = State::Begin;
+ let mut word = String::new(); + let mut visited = HashSet::new(); + + 'machine: loop { + state = match state { + State::Begin => match chars.next() { + None => State::End, + Some(c) if algorithm.is_start(c) => { + word.push(c); + State::Word + } + Some(_) => State::NonWord, + }, + State::End => break 'machine, + State::NonWord => match chars.next() { + None => State::End, + Some(c) if algorithm.is_body(c) => { + word.push(c); + State::Word + } + Some(_) => State::NonWord, + }, + State::Word => match chars.next() { + None => { + if word.len() >= min_length + && algorithm.is_length(word.len()) + && !visited.contains(&word) + { + println!("{}", &word); + visited.insert(word.clone()); + } + State::End + } + Some(c) if algorithm.is_body(c) => { + word.push(c); + State::Word + } + Some(_) => { + if word.len() >= min_length + && algorithm.is_length(word.len()) + && !visited.contains(&word) + { + println!("{}", &word); + visited.insert(word.clone()); + } + word.clear(); + State::NonWord + } + }, + } + } + + Ok(()) +} diff --git a/crates/dtmt/src/cmd/experiment/mod.rs b/crates/dtmt/src/cmd/experiment/mod.rs index b29f83a..51e5fc7 100644 --- a/crates/dtmt/src/cmd/experiment/mod.rs +++ b/crates/dtmt/src/cmd/experiment/mod.rs @@ -1,15 +1,19 @@ use clap::{ArgMatches, Command}; use color_eyre::Result; +mod extract_words; + pub(crate) fn command_definition() -> Command { Command::new("experiment") .subcommand_required(true) .about("A collection of utilities and experiments.") + .subcommand(extract_words::command_definition()) } #[tracing::instrument(skip_all)] pub(crate) async fn run(ctx: sdk::Context, matches: &ArgMatches) -> Result<()> { match matches.subcommand() { + Some(("extract-words", sub_matches)) => extract_words::run(ctx, sub_matches).await, _ => unreachable!( "clap is configured to require a subcommand, and they're all handled above" ), From 6485dae27bc152a6eef7533d548e2c8f871b6563 Mon Sep 17 00:00:00 2001 From: Lucas Schwiderski Date: Sat, 16 Sep 2023 19:03:04 +0200 Subject: [PATCH 03/10] experiment: Add command to create word permutations This creates candidate values to brute force dictionary entries with, by building combinations from a word list and delimiters. 
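To make the enumeration order concrete, here is a minimal sketch of the
growing-length counter the command below is built around (an illustration
of the approach only, not the patch's code: it fixes a single '/' delimiter
instead of permuting the delimiter set, so the exact output order differs
from the example in the help text, and `next_sequence` is a hypothetical
helper name):

    /// Advance `indices` like an odometer where every position counts
    /// 0..word_count. Returns false once the sequence would grow past
    /// `max_len` words.
    fn next_sequence(indices: &mut Vec<usize>, word_count: usize, max_len: usize) -> bool {
        for i in 0..indices.len() {
            if indices[i] + 1 < word_count {
                indices[i] += 1;
                return true;
            }
            indices[i] = 0; // overflow: reset this position and carry on
        }
        // The carry ran past the last position: grow by one word.
        indices.push(0);
        indices.len() <= max_len
    }

    fn main() {
        let words = ["packages", "boot"];
        let mut indices = vec![0];
        loop {
            let parts: Vec<&str> = indices.iter().map(|&i| words[i]).collect();
            println!("{}", parts.join("/"));
            if !next_sequence(&mut indices, words.len(), 2) {
                break;
            }
        }
    }

Enumerating index sequences rather than strings keeps the state tiny and
makes it trivial to resume from a known position (the `--continue` flag).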
---
 Cargo.lock                                    |  35 ++-
 crates/dtmt/Cargo.toml                        |   2 +
 .../src/cmd/experiment/brute_force_words.rs   | 239 ++++++++++++++++++
 crates/dtmt/src/cmd/experiment/mod.rs         |   3 +
 4 files changed, 277 insertions(+), 2 deletions(-)
 create mode 100644 crates/dtmt/src/cmd/experiment/brute_force_words.rs

diff --git a/Cargo.lock b/Cargo.lock
index a251de9..dac07e9 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -161,6 +161,17 @@ dependencies = [
  "system-deps",
 ]
 
+[[package]]
+name = "atty"
+version = "0.2.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
+dependencies = [
+ "hermit-abi 0.1.19",
+ "libc",
+ "winapi",
+]
+
 [[package]]
 name = "autocfg"
 version = "1.3.0"
@@ -212,7 +223,7 @@ dependencies = [
  "bitflags 2.5.0",
  "cexpr",
  "clang-sys",
- "itertools",
+ "itertools 0.12.1",
  "lazy_static",
  "lazycell",
  "log",
@@ -927,6 +938,7 @@ name = "dtmt"
 version = "0.3.0"
 dependencies = [
  "async-recursion",
+ "atty",
  "clap",
  "cli-table",
  "color-eyre",
@@ -936,6 +948,7 @@ dependencies = [
  "futures",
  "futures-util",
  "glob",
+ "itertools 0.11.0",
  "luajit2-sys",
  "nanorand",
  "notify",
@@ -1598,6 +1611,15 @@ version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
 
+[[package]]
+name = "hermit-abi"
+version = "0.1.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "hermit-abi"
 version = "0.3.9"
@@ -1858,6 +1880,15 @@ version = "1.70.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800"
 
+[[package]]
+name = "itertools"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57"
+dependencies = [
+ "either",
+]
+
 [[package]]
 name = "itertools"
 version = "0.12.1"
@@ -2267,7 +2298,7 @@ version = "1.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
 dependencies = [
- "hermit-abi",
+ "hermit-abi 0.3.9",
  "libc",
 ]
 
diff --git a/crates/dtmt/Cargo.toml b/crates/dtmt/Cargo.toml
index d836a50..8018a8b 100644
--- a/crates/dtmt/Cargo.toml
+++ b/crates/dtmt/Cargo.toml
@@ -33,6 +33,8 @@ async-recursion = "1.0.2"
 notify = "6.1.1"
 luajit2-sys = { path = "../../lib/luajit2-sys", version = "*" }
 shlex = { version = "1.2.0", optional = true }
+atty = "0.2.14"
+itertools = "0.11.0"
 
 [dev-dependencies]
 tempfile = "3.3.0"
diff --git a/crates/dtmt/src/cmd/experiment/brute_force_words.rs b/crates/dtmt/src/cmd/experiment/brute_force_words.rs
new file mode 100644
index 0000000..6bf81bb
--- /dev/null
+++ b/crates/dtmt/src/cmd/experiment/brute_force_words.rs
@@ -0,0 +1,239 @@
+use std::path::PathBuf;
+
+use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
+use color_eyre::eyre::{self, Context};
+use color_eyre::Result;
+use itertools::Itertools;
+use tokio::fs;
+
+pub(crate) fn command_definition() -> Command {
+    Command::new("brute-force-words")
+        .about(
+            "Given a list of words and a set of delimiters, iteratively creates permutations \
+             of growing length.\n\
+             Delimiters are placed between every word in the result.\n\n\
+             Example: \
+             Given the words ['packages', 'boot'], the delimiters ['/', '_'] and a \
+             length of 2, the resulting \
+             words will be\n\
+             - packages\n\
+             - boot\n\
+             - packages/packages\n\
+             - packages_packages\n\
+             - packages/boot\n\
+             - packages_boot\n\
+             - boot/packages\n\
+             - boot_packages\n\
+             - boot/boot\n\
+             - boot_boot",
+        )
+        .arg(
+            Arg::new("delimiter")
+                .help(
+                    "The delimiters to put between the words. \
+                     All permutations of this list will be tried for every string of words.\n\
+                     Specify multiple times to set multiple values.\n\
+                     Defaults to ['/', '_'].",
+                )
+                .short('d')
+                .long("delimiter")
+                .action(ArgAction::Append),
+        )
+        .arg(
+            Arg::new("max-length")
+                .help("The maximum number of words up to which to build strings.")
+                .long("max")
+                .long("max-length")
+                .short('m')
+                .default_value("5")
+                .value_parser(value_parser!(usize)),
+        )
+        .arg(
+            Arg::new("continue")
+                .help("Can be used to continue a previous operation where it stopped. Word list and delimiters must match.")
+                .short('c')
+                .long("continue")
+        )
+        .arg(
+            Arg::new("words")
+                .help("Path to a file containing words line by line.")
+                .required(true)
+                .value_parser(value_parser!(PathBuf)),
+        )
+}
+
+#[tracing::instrument(skip_all)]
+#[allow(clippy::mut_range_bound)]
+pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
+    let max_length: usize = matches
+        .get_one::<usize>("max-length")
+        .copied()
+        .expect("parameter has default");
+
+    let words: Vec<String> = {
+        let path = matches
+            .get_one::<PathBuf>("words")
+            .expect("missing required parameter");
+
+        let file = fs::read_to_string(&path)
+            .await
+            .wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
+
+        file.lines().map(str::to_string).collect()
+    };
+
+    if words.is_empty() {
+        eyre::bail!("Word list must not be empty");
+    }
+
+    let mut delimiters: Vec<String> = matches
+        .get_many::<String>("delimiter")
+        .unwrap_or_default()
+        .cloned()
+        .collect();
+
+    if delimiters.is_empty() {
+        delimiters.push(String::from("/"));
+        delimiters.push(String::from("_"));
+    }
+
+    let delimiters_len = delimiters.len();
+
+    let word_count = words.len();
+    tracing::info!("{} words to try", word_count);
+
+    // To be able to easily combine the permutations of words and delimiters,
+    // we turn the latter into a pre-defined list of all permutations of delimiters
+    // that are possible for the given number of words.
+    // Combining `Iterator::cycle` with `Itertools::permutations` works, but
+    // with a high `max_length`, it runs OOM.
+    // So we basically have to implement a smaller version of the iterative algorithm we use later on
+    // to build permutations of the actual words.
+    let delimiter_lists = {
+        let mut indices = vec![0; max_length - 1];
+        let mut list = Vec::new();
+
+        for _ in 0..delimiters_len.pow(max_length as u32 - 1) {
+            list.push(indices.iter().map(|i| &delimiters[*i]).collect::<Vec<_>>());
+
+            for v in indices.iter_mut() {
+                if *v >= delimiters_len - 1 {
+                    *v = 0;
+                    break;
+                } else {
+                    *v += 1;
+                }
+            }
+        }
+
+        list
+    };
+
+    tracing::debug!("{:?}", delimiter_lists);
+
+    let mut count = 0u64;
+
+    let mut indices = if let Some(cont) = matches.get_one::<String>("continue").cloned() {
+        let mut splits = vec![cont.clone()];
+
+        for delim in delimiters.iter() {
+            splits = splits
+                .iter()
+                .flat_map(|s| s.split(delim))
+                .map(|s| s.to_string())
+                .collect();
+        }
+
+        let indices = splits
+            .into_iter()
+            .map(|s| {
+                words
+                    .iter()
+                    .enumerate()
+                    .find(|(_, v)| s == **v)
+                    .map(|(i, _)| i)
+                    .ok_or_else(|| eyre::eyre!("'{}' is not in the word list", s))
+            })
+            .collect::<Result<Vec<_>, _>>()?;
+
+        tracing::info!("Continuing from '{}' -> '{:?}'", cont, &indices);
+
+        indices
+    } else {
+        vec![0]
+    };
+    let mut indices_len = indices.len();
+    let mut sequence = indices
+        .iter()
+        .map(|index| words[*index].as_str())
+        .collect::<Vec<_>>();
+
+    // Prevent re-allocation by reserving as much as we need upfront
+    indices.reserve(max_length);
+    sequence.reserve(max_length);
+
+    'outer: loop {
+        // We only want delimiters between words, so we keep that iterator shorter by
+        // one.
+        let delimiter_count = sequence.len() as u32 - 1;
+
+        tracing::trace!(
+            "{} | {:?} -> {:?}",
+            delimiters_len.pow(delimiter_count),
+            indices,
+            sequence
+        );
+
+        for delims in delimiter_lists
+            .iter()
+            .take(delimiters_len.pow(delimiter_count))
+        {
+            let delims = delims
+                .iter()
+                .map(|s| s.as_str())
+                .take(delimiter_count as usize);
+            let s: String = sequence
+                .iter()
+                .copied()
+                .interleave(delims)
+                .flat_map(|word| word.chars())
+                .collect();
+
+            count = count.wrapping_add(1);
+
+            if count % 500000 == 0 {
+                tracing::info!("{} words generated", count);
+            }
+
+            println!("{}", s);
+        }
+
+        for i in 0..indices_len {
+            let index = indices.get_mut(i).unwrap();
+            let word = sequence.get_mut(i).unwrap();
+
+            if *index >= word_count - 1 {
+                *index = 0;
+                *word = words[*index].as_str();
+
+                if indices.get(i + 1).is_none() {
+                    indices.push(0);
+                    sequence.push(words[0].as_str());
+
+                    indices_len += 1;
+
+                    if indices_len > max_length {
+                        break 'outer;
+                    }
+
+                    break;
+                }
+            } else {
+                *index += 1;
+                *word = words[*index].as_str();
+                break;
+            }
+        }
+    }
+
+    Ok(())
+}
diff --git a/crates/dtmt/src/cmd/experiment/mod.rs b/crates/dtmt/src/cmd/experiment/mod.rs
index 51e5fc7..9ceb3b9 100644
--- a/crates/dtmt/src/cmd/experiment/mod.rs
+++ b/crates/dtmt/src/cmd/experiment/mod.rs
@@ -1,18 +1,21 @@
 use clap::{ArgMatches, Command};
 use color_eyre::Result;
 
+mod brute_force_words;
 mod extract_words;
 
 pub(crate) fn command_definition() -> Command {
     Command::new("experiment")
         .subcommand_required(true)
         .about("A collection of utilities and experiments.")
+        .subcommand(brute_force_words::command_definition())
         .subcommand(extract_words::command_definition())
 }
 
 #[tracing::instrument(skip_all)]
 pub(crate) async fn run(ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
     match matches.subcommand() {
+        Some(("brute-force-words", sub_matches)) => brute_force_words::run(ctx, sub_matches).await,
         Some(("extract-words", sub_matches)) => extract_words::run(ctx, sub_matches).await,
         _ => unreachable!(
             "clap is configured to require a subcommand, and they're all handled above"

From 0d1193a12688567fc30b963b3d79c20d96bf55c4 Mon Sep 17 00:00:00 2001
From: Lucas Schwiderski Date: Mon, 18 Sep 2023 10:26:58 +0200 Subject: [PATCH 04/10] sdk: Improve word generation throughput It seems that the simple `println!()` is really bad when the goal is to write a lot of data to stdout. Presumably because it's unbuffered, but also because it required the preceding code to do a lot of allocations. This was replaced with a buffered writer on stdout, as well as an extra `Vec` that I can write everything to directly from the word and delimiter iterators, without allocating a single new structure. --- .../src/cmd/experiment/brute_force_words.rs | 76 ++++++++++++++++--- 1 file changed, 67 insertions(+), 9 deletions(-) diff --git a/crates/dtmt/src/cmd/experiment/brute_force_words.rs b/crates/dtmt/src/cmd/experiment/brute_force_words.rs index 6bf81bb..d2891f9 100644 --- a/crates/dtmt/src/cmd/experiment/brute_force_words.rs +++ b/crates/dtmt/src/cmd/experiment/brute_force_words.rs @@ -5,6 +5,7 @@ use color_eyre::eyre::{self, Context}; use color_eyre::Result; use itertools::Itertools; use tokio::fs; +use tokio::io::{AsyncWriteExt, BufWriter}; pub(crate) fn command_definition() -> Command { Command::new("brute-force-words") @@ -98,6 +99,38 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> let delimiters_len = delimiters.len(); + let prefixes = [ + "", + "content/characters/", + "content/debug/", + "content/decals/", + "content/environment/", + "content/fx/", + "content/gizmos/", + "content/items/", + "content/levels/", + "content/liquid_area/", + "content/localization/", + "content/materials/", + "content/minion_impact_assets/", + "content/pickups/", + "content/shading_environments/", + "content/textures/", + "content/ui/", + "content/videos/", + "content/vo/", + "content/volume_types/", + "content/weapons/", + "packages/boot_assets/", + "packages/content/", + "packages/game_scripts/", + "packages/strings/", + "packages/ui/", + "wwise/events/", + "wwise/packages/", + "wwise/world_sound_fx/", + ]; + let word_count = words.len(); tracing::info!("{} words to try", word_count); @@ -171,6 +204,13 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> indices.reserve(max_length); sequence.reserve(max_length); + let mut writer = BufWriter::new(tokio::io::stdout()); + let mut buf = Vec::with_capacity(1024); + + const LINE_FEED: u8 = 0x0A; + const UNDERSCORE: u8 = 0x5F; + const ZERO: u8 = 0x30; + 'outer: loop { // We only want delimiters between words, so we keep that iterator shorter by // one. 
@@ -191,20 +231,38 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
             .iter()
             .map(|s| s.as_str())
             .take(delimiter_count as usize);
-        let s: String = sequence
-            .iter()
-            .copied()
-            .interleave(delims)
-            .flat_map(|word| word.chars())
-            .collect();
+        let s = sequence.iter().copied().interleave(delims.clone());
 
         count = count.wrapping_add(1);
 
-        if count % 500000 == 0 {
-            tracing::info!("{} words generated", count);
+        buf.clear();
+
+        for prefix in prefixes.iter() {
+            buf.extend_from_slice(prefix.as_bytes());
+            s.clone()
+                .for_each(|word| buf.extend_from_slice(word.as_bytes()));
+            // buf.extend_from_slice(s.as_bytes());
+            buf.push(LINE_FEED);
+
+            for i in 0..=9 {
+                buf.extend_from_slice(prefix.as_bytes());
+                s.clone()
+                    .for_each(|word| buf.extend_from_slice(word.as_bytes()));
+                buf.push(UNDERSCORE);
+                buf.push(ZERO + i);
+                buf.push(LINE_FEED);
+
+                buf.extend_from_slice(prefix.as_bytes());
+                s.clone()
+                    .for_each(|word| buf.extend_from_slice(word.as_bytes()));
+                buf.push(UNDERSCORE);
+                buf.push(ZERO);
+                buf.push(ZERO + i);
+                buf.push(LINE_FEED);
+            }
         }
 
-        println!("{}", s);
+        writer.write_all(&buf).await?;
     }
 
     for i in 0..indices_len {

From 4480144d92db48f1f1515ef046984f941f14a89f Mon Sep 17 00:00:00 2001
From: Lucas Schwiderski
Date: Mon, 18 Sep 2023 13:29:42 +0200
Subject: [PATCH 05/10] sdk: Implement guessing a list of hashes

While the approach to generate and store a list of strings does allow
for this list to be re-used in the future, the I/O involved turned out
to be quite costly. While the generation can run at up to 500 MiB/s,
even compressing that on the fly doesn't reach fast enough write speeds
on a HDD. And compression is also necessary to store this amount of
data (generation reached two TB of raw data with a word length of just
three, which is still 600 GB compressed). But compression also makes
working with that data a lot harder.

So this instead combines both the generation and search into a single
step. The intermediate result of the generation is therefore lost, but
the overall pipeline is much faster.
---
 .../src/cmd/experiment/brute_force_words.rs   | 120 +++++++++++++++---
 1 file changed, 102 insertions(+), 18 deletions(-)

diff --git a/crates/dtmt/src/cmd/experiment/brute_force_words.rs b/crates/dtmt/src/cmd/experiment/brute_force_words.rs
index d2891f9..bb3aa9e 100644
--- a/crates/dtmt/src/cmd/experiment/brute_force_words.rs
+++ b/crates/dtmt/src/cmd/experiment/brute_force_words.rs
@@ -1,11 +1,14 @@
+use std::collections::HashSet;
 use std::path::PathBuf;
 
 use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
 use color_eyre::eyre::{self, Context};
 use color_eyre::Result;
 use itertools::Itertools;
+use sdk::murmur::Murmur64;
 use tokio::fs;
-use tokio::io::{AsyncWriteExt, BufWriter};
+use tokio::io::AsyncWriteExt;
+use tokio::time::Instant;
 
 pub(crate) fn command_definition() -> Command {
     Command::new("brute-force-words")
@@ -60,6 +63,15 @@ pub(crate) fn command_definition() -> Command {
             .required(true)
             .value_parser(value_parser!(PathBuf)),
     )
+    .arg(
+        Arg::new("hashes")
+            .help(
+                "Path to a file containing the hashes to attempt to brute force. \
+                 Hashes are expected in hexadecimal notation. \
+                 Only 64-bit hashes are supported."
+            )
+            .value_parser(value_parser!(PathBuf)),
+    )
 }
 
 #[tracing::instrument(skip_all)]
@@ -86,6 +98,25 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
         eyre::bail!("Word list must not be empty");
     }
 
+    let hashes = if let Some(path) = matches.get_one::<PathBuf>("hashes") {
+        let content = fs::read_to_string(&path)
+            .await
+            .wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
+
+        let hashes: Result<HashSet<Murmur64>, _> = content
+            .lines()
+            .map(|s| u64::from_str_radix(s, 16).map(Murmur64::from))
+            .collect();
+
+        let hashes = hashes?;
+
+        tracing::trace!("{:?}", hashes);
+
+        Some(hashes)
+    } else {
+        None
+    };
+
     let mut delimiters: Vec<String> = matches
         .get_many::<String>("delimiter")
         .unwrap_or_default()
@@ -163,8 +194,6 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
 
     tracing::debug!("{:?}", delimiter_lists);
 
-    let mut count = 0u64;
-
     let mut indices = if let Some(cont) = matches.get_one::<String>("continue").cloned() {
         let mut splits = vec![cont.clone()];
 
@@ -204,7 +233,12 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
     indices.reserve(max_length);
     sequence.reserve(max_length);
 
-    let mut writer = BufWriter::new(tokio::io::stdout());
+    let mut count: usize = 0;
+    let mut found: usize = 0;
+    let mut start = Instant::now();
+
+    // let mut writer = BufWriter::new(tokio::io::stdout());
+    let mut writer = tokio::io::stdout();
     let mut buf = Vec::with_capacity(1024);
 
     const LINE_FEED: u8 = 0x0A;
@@ -216,13 +250,6 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
         // one.
         let delimiter_count = sequence.len() as u32 - 1;
 
-        tracing::trace!(
-            "{} | {:?} -> {:?}",
-            delimiters_len.pow(delimiter_count),
-            indices,
-            sequence
-        );
-
         for delims in delimiter_lists
             .iter()
             .take(delimiters_len.pow(delimiter_count))
         {
@@ -233,16 +260,25 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
                 .take(delimiter_count as usize);
             let s = sequence.iter().copied().interleave(delims.clone());
 
-            count = count.wrapping_add(1);
-
             buf.clear();
 
             for prefix in prefixes.iter() {
                 buf.extend_from_slice(prefix.as_bytes());
                 s.clone()
                     .for_each(|word| buf.extend_from_slice(word.as_bytes()));
-                // buf.extend_from_slice(s.as_bytes());
-                buf.push(LINE_FEED);
+
+                if let Some(hashes) = &hashes {
+                    let hash = Murmur64::hash(&buf);
+                    if hashes.contains(&hash) {
+                        found += 1;
+                        buf.push(LINE_FEED);
+                        writer.write_all(&buf).await?;
+                    }
+
+                    buf.clear();
+                } else {
+                    buf.push(LINE_FEED);
+                }
 
                 for i in 0..=9 {
                     buf.extend_from_slice(prefix.as_bytes());
@@ -250,7 +286,19 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
                         .for_each(|word| buf.extend_from_slice(word.as_bytes()));
                     buf.push(UNDERSCORE);
                     buf.push(ZERO + i);
-                    buf.push(LINE_FEED);
+
+                    if let Some(hashes) = &hashes {
+                        let hash = Murmur64::hash(&buf);
+                        if hashes.contains(&hash) {
+                            found += 1;
+                            buf.push(LINE_FEED);
+                            writer.write_all(&buf).await?;
+                        }
+
+                        buf.clear();
+                    } else {
+                        buf.push(LINE_FEED);
+                    }
 
                     buf.extend_from_slice(prefix.as_bytes());
                     s.clone()
@@ -258,11 +306,47 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
                     buf.push(UNDERSCORE);
                     buf.push(ZERO);
                    buf.push(ZERO + i);
-                    buf.push(LINE_FEED);
+
+                    if let Some(hashes) = &hashes {
+                        let hash = Murmur64::hash(&buf);
+                        if hashes.contains(&hash) {
+                            found += 1;
+                            buf.push(LINE_FEED);
+                            writer.write_all(&buf).await?;
+                        }
+
+                        buf.clear();
+                    } else {
+                        buf.push(LINE_FEED);
+                    }
                 }
             }
 
-            writer.write_all(&buf).await?;
+            if let Some(hashes) = &hashes {
+                count += prefixes.len() * 20;
+
+                let dur = Instant::now() - start;
+                if dur.as_secs() >= 1 {
+                    let hashes_len = hashes.len();
+                    // Don't care when it finishes, don't care if it fails.
+                    tokio::spawn(async move {
+                        let _ = tokio::io::stderr()
+                            .write_all(
+                                format!(
+                                    "\r{} hashes per second, {}/{} found",
+                                    count, found, hashes_len
+                                )
+                                .as_bytes(),
+                            )
+                            .await;
+                    });
+
+                    start = Instant::now();
+                    count = 0;
+                }
+            } else {
+                writer.write_all(&buf).await?;
+            }
         }
 
         for i in 0..indices_len {

From 951a7f82c0a2f1532df9a14ff622504a03bc20a7 Mon Sep 17 00:00:00 2001
From: Lucas Schwiderski
Date: Tue, 19 Sep 2023 15:28:40 +0200
Subject: [PATCH 06/10] sdk: Improve word generation

---
 .../src/cmd/experiment/brute_force_words.rs   | 164 +++++++++---------
 1 file changed, 79 insertions(+), 85 deletions(-)

diff --git a/crates/dtmt/src/cmd/experiment/brute_force_words.rs b/crates/dtmt/src/cmd/experiment/brute_force_words.rs
index bb3aa9e..7e93dcc 100644
--- a/crates/dtmt/src/cmd/experiment/brute_force_words.rs
+++ b/crates/dtmt/src/cmd/experiment/brute_force_words.rs
@@ -70,6 +70,7 @@ pub(crate) fn command_definition() -> Command {
                 Hashes are expected in hexadecimal notation. \
                 Only 64-bit hashes are supported."
             )
+            .required(true)
             .value_parser(value_parser!(PathBuf)),
     )
 }
@@ -98,7 +99,10 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
         eyre::bail!("Word list must not be empty");
     }
 
-    let hashes = if let Some(path) = matches.get_one::<PathBuf>("hashes") {
+    let hashes = {
+        let path = matches
+            .get_one::<PathBuf>("hashes")
+            .expect("missing required argument");
         let content = fs::read_to_string(&path)
             .await
             .wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
@@ -112,9 +116,7 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
 
         tracing::trace!("{:?}", hashes);
 
-        Some(hashes)
-    } else {
-        None
+        hashes
     };
 
     let mut delimiters: Vec<String> = matches
@@ -250,103 +252,95 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
         // one.
         let delimiter_count = sequence.len() as u32 - 1;
 
-        for delims in delimiter_lists
-            .iter()
-            .take(delimiters_len.pow(delimiter_count))
-        {
-            let delims = delims
-                .iter()
-                .map(|s| s.as_str())
-                .take(delimiter_count as usize);
-            let s = sequence.iter().copied().interleave(delims.clone());
-
+        for prefix in prefixes.iter().map(|p| p.as_bytes()) {
             buf.clear();
 
-            for prefix in prefixes.iter() {
-                buf.extend_from_slice(prefix.as_bytes());
-                s.clone()
+            // We can keep the prefix at the front of the buffer and only
+            // replace the parts after that.
+ let prefix_len = prefix.len(); + buf.extend_from_slice(prefix); + + for delims in delimiter_lists + .iter() + .take(delimiters_len.pow(delimiter_count)) + { + buf.truncate(prefix_len); + + let delims = delims + .iter() + .map(|s| s.as_str()) + .take(delimiter_count as usize); + sequence + .iter() + .copied() + .interleave(delims.clone()) .for_each(|word| buf.extend_from_slice(word.as_bytes())); - if let Some(hashes) = &hashes { - let hash = Murmur64::hash(&buf); - if hashes.contains(&hash) { - found += 1; - buf.push(LINE_FEED); - writer.write_all(&buf).await?; - } + count += 1; - buf.clear(); - } else { + let hash = Murmur64::hash(&buf); + if hashes.contains(&hash) { + found += 1; buf.push(LINE_FEED); - } + writer.write_all(&buf).await?; + } else { + let word_len = buf.len(); - for i in 0..=9 { - buf.extend_from_slice(prefix.as_bytes()); - s.clone() - .for_each(|word| buf.extend_from_slice(word.as_bytes())); - buf.push(UNDERSCORE); - buf.push(ZERO + i); + // If the regular word itself didn't match, we check + // for numbered suffixes. + // For now, we only check up to `09` to avoid more complex logic + // writing into the buffer. + // Packages that contain files with higher numbers than this + // should hopefully become easier to spot once a good number of + // hashes is found. + for i in 1..=9 { + buf.truncate(word_len); + buf.push(UNDERSCORE); + buf.push(ZERO); + buf.push(ZERO + i); + + count += 1; - if let Some(hashes) = &hashes { let hash = Murmur64::hash(&buf); if hashes.contains(&hash) { found += 1; buf.push(LINE_FEED); writer.write_all(&buf).await?; + } else { + break; } - - buf.clear(); - } else { - buf.push(LINE_FEED); - } - - buf.extend_from_slice(prefix.as_bytes()); - s.clone() - .for_each(|word| buf.extend_from_slice(word.as_bytes())); - buf.push(UNDERSCORE); - buf.push(ZERO); - buf.push(ZERO + i); - - if let Some(hashes) = &hashes { - let hash = Murmur64::hash(&buf); - if hashes.contains(&hash) { - found += 1; - buf.push(LINE_FEED); - writer.write_all(&buf).await?; - } - - buf.clear(); - } else { - buf.push(LINE_FEED); } } } + } - if let Some(hashes) = &hashes { - count += prefixes.len() * 20; + let dur = Instant::now() - start; + if dur.as_secs() >= 1 { + let hashes_len = hashes.len(); + let s = String::from_utf8_lossy(&buf); + // The last prefix in the set is the one that will stay in the buffer + // when we're about to print here. + // So we strip that, to show just the generated part. + // We also restrict the length to stay on a single line. + let prefix_len = prefixes[28].len(); + let s = s[prefix_len..std::cmp::min(s.len(), prefix_len + 60)] + .trim_end() + .to_string(); + // Don't care when it finishes, don't care if it fails. + tokio::spawn(async move { + let _ = tokio::io::stderr() + .write_all( + format!( + "\r{:8} hashes per second | {:6}/{} found | {:<60}", + count, found, hashes_len, s + ) + .as_bytes(), + ) + .await; + }); - let dur = Instant::now() - start; - if dur.as_secs() >= 1 { - let hashes_len = hashes.len(); - // Don't care when it finishes, don't care if it fails. 
- tokio::spawn(async move { - let _ = tokio::io::stderr() - .write_all( - format!( - "\r{} hashes per second, {}/{} found", - count, found, hashes_len - ) - .as_bytes(), - ) - .await; - }); - - start = Instant::now(); - count = 0; - } - } else { - writer.write_all(&buf).await?; - } + start = Instant::now(); + count = 0; } for i in 0..indices_len { @@ -358,15 +352,15 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> *word = words[*index].as_str(); if indices.get(i + 1).is_none() { - indices.push(0); - sequence.push(words[0].as_str()); - indices_len += 1; if indices_len > max_length { break 'outer; } + indices.push(0); + sequence.push(words[0].as_str()); + break; } } else { From b366185a63f8182b1c9d8e32b16f738a42f7a912 Mon Sep 17 00:00:00 2001 From: Lucas Schwiderski Date: Tue, 19 Sep 2023 15:29:40 +0200 Subject: [PATCH 07/10] sdk: Implement worker pool for word generation Massive speed improvement. The index generation is really fast, and it appears that even worker numbers way higher than the core/thread count still increase the throughput slightly. The only missing part is the info output. That's broken, currently. --- Cargo.lock | 49 ++ crates/dtmt/Cargo.toml | 1 + .../src/cmd/experiment/brute_force_words.rs | 544 +++++++++++------- crates/dtmt/src/cmd/experiment/mod.rs | 4 +- 4 files changed, 401 insertions(+), 197 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dac07e9..3a02b55 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -658,6 +658,20 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2801af0d36612ae591caa9568261fddce32ce6e08a7275ea334a06a4ad021a2c" +dependencies = [ + "cfg-if", + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-epoch", + "crossbeam-queue", + "crossbeam-utils", +] + [[package]] name = "crossbeam-channel" version = "0.5.12" @@ -667,6 +681,40 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "crossbeam-deque" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +dependencies = [ + "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" +dependencies = [ + "autocfg", + "cfg-if", + "crossbeam-utils", + "memoffset 0.9.1", + "scopeguard", +] + +[[package]] +name = "crossbeam-queue" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1cfb3ea8a53f37c40dea2c7bedcbd88bdfae54f5e2175d6ecaff1c988353add" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.20" @@ -943,6 +991,7 @@ dependencies = [ "cli-table", "color-eyre", "confy", + "crossbeam", "csv-async", "dtmt-shared", "futures", diff --git a/crates/dtmt/Cargo.toml b/crates/dtmt/Cargo.toml index 8018a8b..e80feaa 100644 --- a/crates/dtmt/Cargo.toml +++ b/crates/dtmt/Cargo.toml @@ -35,6 +35,7 @@ luajit2-sys = { path = "../../lib/luajit2-sys", version = "*" } shlex = { version = "1.2.0", optional = true } atty = "0.2.14" itertools = "0.11.0" +crossbeam = { version = "0.8.2", features = ["crossbeam-deque"] } [dev-dependencies] tempfile = "3.3.0" diff --git a/crates/dtmt/src/cmd/experiment/brute_force_words.rs 
b/crates/dtmt/src/cmd/experiment/brute_force_words.rs index 7e93dcc..aa15003 100644 --- a/crates/dtmt/src/cmd/experiment/brute_force_words.rs +++ b/crates/dtmt/src/cmd/experiment/brute_force_words.rs @@ -1,13 +1,16 @@ use std::collections::HashSet; +use std::fs; +use std::io::Write; use std::path::PathBuf; +use std::sync::Arc; +use std::thread::JoinHandle; use clap::{value_parser, Arg, ArgAction, ArgMatches, Command}; use color_eyre::eyre::{self, Context}; use color_eyre::Result; +use crossbeam::channel::{bounded, unbounded, Receiver, Sender}; use itertools::Itertools; use sdk::murmur::Murmur64; -use tokio::fs; -use tokio::io::AsyncWriteExt; use tokio::time::Instant; pub(crate) fn command_definition() -> Command { @@ -57,6 +60,14 @@ pub(crate) fn command_definition() -> Command { .short('c') .long("continue") ) + .arg( + Arg::new("threads") + .help("The number of workers to run in parallel.") + .long("threads") + .short('n') + .default_value("6") + .value_parser(value_parser!(usize)) + ) .arg( Arg::new("words") .help("Path to a file containing words line by line.") @@ -75,36 +86,307 @@ pub(crate) fn command_definition() -> Command { ) } +const LINE_FEED: u8 = 0x0A; +const UNDERSCORE: u8 = 0x5F; +const ZERO: u8 = 0x30; + +const PREFIXES: [&str; 29] = [ + "", + "content/characters/", + "content/debug/", + "content/decals/", + "content/environment/", + "content/fx/", + "content/gizmos/", + "content/items/", + "content/levels/", + "content/liquid_area/", + "content/localization/", + "content/materials/", + "content/minion_impact_assets/", + "content/pickups/", + "content/shading_environments/", + "content/textures/", + "content/ui/", + "content/videos/", + "content/vo/", + "content/volume_types/", + "content/weapons/", + "packages/boot_assets/", + "packages/content/", + "packages/game_scripts/", + "packages/strings/", + "packages/ui/", + "wwise/events/", + "wwise/packages/", + "wwise/world_sound_fx/", +]; + +fn make_info_printer(rx: Receiver<(usize, usize)>, hash_count: usize) -> JoinHandle<()> { + std::thread::spawn(move || { + let mut writer = std::io::stderr(); + let mut total_count = 0; + let mut total_found = 0; + + let start = Instant::now(); + + while let Ok((count, found)) = rx.recv() { + total_count += count; + total_found += found; + + let dur = Instant::now() - start; + if dur.as_secs() > 1 { + let s = format!("\r{total_count} per second | {total_found:6}/{hash_count} found",); + + // let s = String::from_utf8_lossy(&buf); + // // The last prefix in the set is the one that will stay in the buffer + // // when we're about to print here. + // // So we strip that, to show just the generated part. + // // We also restrict the length to stay on a single line. 
+            //     let prefix_len = prefixes[28].len();
+            //     let s = s[prefix_len..std::cmp::min(s.len(), prefix_len + 60)]
+            //         .trim_end()
+            //         .to_string();
+
+                writer.write_all(s.as_bytes()).unwrap();
+
+                total_count = 0;
+            }
+        }
+    })
+}
+
+fn make_stdout_printer(rx: Receiver<Vec<u8>>) -> JoinHandle<()> {
+    std::thread::spawn(move || {
+        let mut writer = std::io::stdout();
+
+        while let Ok(buf) = rx.recv() {
+            writer.write_all(&buf).unwrap();
+        }
+    })
+}
+
+struct State {
+    delimiter_lists: Arc<Vec<Vec<String>>>,
+    hashes: Arc<HashSet<Murmur64>>,
+    words: Arc<Vec<String>>,
+    delimiters_len: usize,
+    stdout_tx: Sender<Vec<u8>>,
+    info_tx: Sender<(usize, usize)>,
+}
+
+fn make_worker(rx: Receiver<Vec<usize>>, state: State) -> JoinHandle<()> {
+    std::thread::spawn(move || {
+        let delimiter_lists = &state.delimiter_lists;
+        let hashes = &state.hashes;
+        let words = &state.words;
+        let delimiters_len = state.delimiters_len;
+
+        let mut count = 0;
+        let mut found = 0;
+        let mut buf = Vec::with_capacity(1024);
+
+        // while let Some(indices) = find_task(local, global, &[]) {
+        while let Ok(indices) = rx.recv() {
+            let sequence = indices.iter().map(|i| words[*i].as_str());
+
+            // We only want delimiters between words, so we keep that iterator shorter by
+            // one.
+            let delimiter_count = sequence.len() as u32 - 1;
+
+            for prefix in PREFIXES.iter().map(|p| p.as_bytes()) {
+                buf.clear();
+
+                // We can keep the prefix at the front of the buffer and only
+                // replace the parts after that.
+                let prefix_len = prefix.len();
+                buf.extend_from_slice(prefix);
+
+                for delims in delimiter_lists
+                    .iter()
+                    .take(delimiters_len.pow(delimiter_count))
+                {
+                    buf.truncate(prefix_len);
+
+                    let delims = delims
+                        .iter()
+                        .map(|s| s.as_str())
+                        .take(delimiter_count as usize);
+                    sequence
+                        .clone()
+                        .interleave(delims.clone())
+                        .for_each(|word| buf.extend_from_slice(word.as_bytes()));
+
+                    count += 1;
+
+                    let hash = Murmur64::hash(&buf);
+                    if hashes.contains(&hash) {
+                        found += 1;
+
+                        buf.push(LINE_FEED);
+                        if let Err(_) = state.stdout_tx.send(buf.clone()) {
+                            return;
+                        }
+                    } else {
+                        let word_len = buf.len();
+
+                        // If the regular word itself didn't match, we check
+                        // for numbered suffixes.
+                        // For now, we only check up to `09` to avoid more complex logic
+                        // writing into the buffer.
+                        // Packages that contain files with higher numbers than this
+                        // should hopefully become easier to spot once a good number of
+                        // hashes is found.
+                        for i in 1..=9 {
+                            buf.truncate(word_len);
+                            buf.push(UNDERSCORE);
+                            buf.push(ZERO);
+                            buf.push(ZERO + i);
+
+                            count += 1;
+
+                            let hash = Murmur64::hash(&buf);
+                            if hashes.contains(&hash) {
+                                found += 1;
+
+                                buf.push(LINE_FEED);
+                                if let Err(_) = state.stdout_tx.send(buf.clone()) {
+                                    return;
+                                }
+                            } else {
+                                break;
+                            }
+                        }
+                    }
+                }
+            }
+
+            if count >= 1024 * 1024 {
+                let _ = state.info_tx.send((count, found));
+            }
+
+            // let dur = Instant::now() - start;
+            // if dur.as_secs() >= 1 {
+            //     let hashes_len = hashes.len();
+            //     let s = String::from_utf8_lossy(&buf);
+            //     // The last prefix in the set is the one that will stay in the buffer
+            //     // when we're about to print here.
+            //     // So we strip that, to show just the generated part.
+            //     // We also restrict the length to stay on a single line.
+            //     let prefix_len = prefixes[28].len();
+            //     let s = s[prefix_len..std::cmp::min(s.len(), prefix_len + 60)]
+            //         .trim_end()
+            //         .to_string();
+            //     info_tx.send(format!(
+            //         "\r{:8} hashes per second | {:6}/{} found | {:<60}",
+            //         count, found, hashes_len, s
+            //     ));
+
+            //     start = Instant::now();
+            //     count = 0;
+            // }
+        }
+    })
+}
+
+fn build_delimiter_lists(delimiters: impl AsRef<[String]>, max_length: usize) -> Vec<Vec<String>> {
+    let delimiters = delimiters.as_ref();
+    let mut indices = vec![0; max_length];
+    let mut list = Vec::new();
+
+    for _ in 0..delimiters.len().pow(max_length as u32) {
+        list.push(
+            indices
+                .iter()
+                .map(|i| delimiters[*i].clone())
+                .collect::<Vec<_>>(),
+        );
+
+        for v in indices.iter_mut() {
+            if *v >= delimiters.len() - 1 {
+                *v = 0;
+                break;
+            } else {
+                *v += 1;
+            }
+        }
+    }
+
+    list
+}
+
+fn build_initial_indices(
+    cont: Option<&String>,
+    delimiters: impl AsRef<[String]>,
+    words: impl AsRef<[String]>,
+) -> Result<Vec<usize>> {
+    if let Some(cont) = cont {
+        let mut splits = vec![cont.clone()];
+
+        for delim in delimiters.as_ref().iter() {
+            splits = splits
+                .iter()
+                .flat_map(|s| s.split(delim))
+                .map(|s| s.to_string())
+                .collect();
+        }
+
+        let indices = splits
+            .into_iter()
+            .map(|s| {
+                words
+                    .as_ref()
+                    .iter()
+                    .enumerate()
+                    .find(|(_, v)| s == **v)
+                    .map(|(i, _)| i)
+                    .ok_or_else(|| eyre::eyre!("'{}' is not in the word list", s))
+            })
+            .collect::<Result<Vec<_>, _>>()?;
+
+        tracing::info!("Continuing from '{}' -> '{:?}'", cont, &indices);
+
+        Ok(indices)
+    } else {
+        Ok(vec![0])
+    }
+}
+
 #[tracing::instrument(skip_all)]
 #[allow(clippy::mut_range_bound)]
-pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
+pub(crate) fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
     let max_length: usize = matches
         .get_one::<usize>("max-length")
         .copied()
         .expect("parameter has default");
 
-    let words: Vec<String> = {
+    let num_threads: usize = matches
+        .get_one::<usize>("threads")
+        .copied()
+        .expect("parameter has default");
+
+    let words = {
         let path = matches
             .get_one::<PathBuf>("words")
             .expect("missing required parameter");
 
-        let file = fs::read_to_string(&path)
-            .await
+        let file = fs::read_to_string(path)
             .wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
 
-        file.lines().map(str::to_string).collect()
-    };
+        let words: Vec<_> = file.lines().map(str::to_string).collect();
 
-    if words.is_empty() {
-        eyre::bail!("Word list must not be empty");
-    }
+        if words.is_empty() {
+            eyre::bail!("Word list must not be empty");
+        }
+
+        Arc::new(words)
+    };
 
     let hashes = {
         let path = matches
             .get_one::<PathBuf>("hashes")
             .expect("missing required argument");
-        let content = fs::read_to_string(&path)
-            .await
+        let content = fs::read_to_string(path)
             .wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
 
         let hashes: Result<HashSet<Murmur64>, _> = content
@@ -116,7 +398,7 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
 
         tracing::trace!("{:?}", hashes);
 
-        hashes
+        Arc::new(hashes)
     };
 
     let mut delimiters: Vec<String> = matches
@@ -132,38 +414,6 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
 
     let delimiters_len = delimiters.len();
 
-    let prefixes = [
-        "",
-        "content/characters/",
-        "content/debug/",
-        "content/decals/",
-        "content/environment/",
-        "content/fx/",
-        "content/gizmos/",
-        "content/items/",
-        "content/levels/",
-        "content/liquid_area/",
-        "content/localization/",
-        "content/materials/",
-        "content/minion_impact_assets/",
-        "content/pickups/",
-        "content/shading_environments/",
-        "content/textures/",
"content/ui/", - "content/videos/", - "content/vo/", - "content/volume_types/", - "content/weapons/", - "packages/boot_assets/", - "packages/content/", - "packages/game_scripts/", - "packages/strings/", - "packages/ui/", - "wwise/events/", - "wwise/packages/", - "wwise/world_sound_fx/", - ]; - let word_count = words.len(); tracing::info!("{} words to try", word_count); @@ -175,56 +425,43 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> // So we basically have to implement a smaller version of the iterative algorithm we use later on // to build permutations of the actual words. let delimiter_lists = { - let mut indices = vec![0; max_length - 1]; - let mut list = Vec::new(); - - for _ in 0..delimiters_len.pow(max_length as u32 - 1) { - list.push(indices.iter().map(|i| &delimiters[*i]).collect::>()); - - for v in indices.iter_mut() { - if *v >= delimiters_len - 1 { - *v = 0; - break; - } else { - *v += 1; - } - } - } - - list + let lists = build_delimiter_lists(&delimiters, max_length - 1); + Arc::new(lists) }; - tracing::debug!("{:?}", delimiter_lists); - let mut indices = if let Some(cont) = matches.get_one::("continue").cloned() { - let mut splits = vec![cont.clone()]; + let (info_tx, info_rx) = unbounded(); + let (stdout_tx, stdout_rx) = unbounded::>(); + let (task_tx, task_rx) = bounded::>(100); + let mut handles = Vec::new(); - for delim in delimiters.iter() { - splits = splits - .iter() - .flat_map(|s| s.split(delim)) - .map(|s| s.to_string()) - .collect(); - } + for _ in 0..num_threads { + let handle = make_worker( + task_rx.clone(), + State { + delimiter_lists: Arc::clone(&delimiter_lists), + hashes: Arc::clone(&hashes), + words: Arc::clone(&words), + delimiters_len, + stdout_tx: stdout_tx.clone(), + info_tx: info_tx.clone(), + }, + ); + handles.push(handle); + } + // These are only used inside the worker threads, but due to the loops above, we had to + // clone them one too many times. + // So we drop that extra reference immediately, to ensure that the channels can + // disconnect properly when the threads finish. + drop(stdout_tx); + drop(info_tx); - let indices = splits - .into_iter() - .map(|s| { - words - .iter() - .enumerate() - .find(|(_, v)| s == **v) - .map(|(i, _)| i) - .ok_or_else(|| eyre::eyre!("'{}' is not in the word list", s)) - }) - .collect::>()?; + // handles.push(make_info_printer(info_rx, hashes.len())); + handles.push(make_stdout_printer(stdout_rx)); - tracing::info!("Continuing from '{}' -> '{:?}'", cont, &indices); - - indices - } else { - vec![0] - }; + let mut indices = + build_initial_indices(matches.get_one::("continue"), &delimiters, &*words) + .wrap_err("Failed to build initial indices")?; let mut indices_len = indices.len(); let mut sequence = indices .iter() @@ -235,113 +472,8 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> indices.reserve(max_length); sequence.reserve(max_length); - let mut count: usize = 0; - let mut found: usize = 0; - let mut start = Instant::now(); - - // let mut writer = BufWriter::new(tokio::io::stdout()); - let mut writer = tokio::io::stdout(); - let mut buf = Vec::with_capacity(1024); - - const LINE_FEED: u8 = 0x0A; - const UNDERSCORE: u8 = 0x5F; - const ZERO: u8 = 0x30; - 'outer: loop { - // We only want delimiters between words, so we keep that iterator shorter by - // one. 
-        let delimiter_count = sequence.len() as u32 - 1;
-
-        for prefix in prefixes.iter().map(|p| p.as_bytes()) {
-            buf.clear();
-
-            // We can keep the prefix at the front of the buffer and only
-            // replace the parts after that.
-            let prefix_len = prefix.len();
-            buf.extend_from_slice(prefix);
-
-            for delims in delimiter_lists
-                .iter()
-                .take(delimiters_len.pow(delimiter_count))
-            {
-                buf.truncate(prefix_len);
-
-                let delims = delims
-                    .iter()
-                    .map(|s| s.as_str())
-                    .take(delimiter_count as usize);
-                sequence
-                    .iter()
-                    .copied()
-                    .interleave(delims.clone())
-                    .for_each(|word| buf.extend_from_slice(word.as_bytes()));
-
-                count += 1;
-
-                let hash = Murmur64::hash(&buf);
-                if hashes.contains(&hash) {
-                    found += 1;
-                    buf.push(LINE_FEED);
-                    writer.write_all(&buf).await?;
-                } else {
-                    let word_len = buf.len();
-
-                    // If the regular word itself didn't match, we check
-                    // for numbered suffixes.
-                    // For now, we only check up to `09` to avoid more complex logic
-                    // writing into the buffer.
-                    // Packages that contain files with higher numbers than this
-                    // should hopefully become easier to spot once a good number of
-                    // hashes is found.
-                    for i in 1..=9 {
-                        buf.truncate(word_len);
-                        buf.push(UNDERSCORE);
-                        buf.push(ZERO);
-                        buf.push(ZERO + i);
-
-                        count += 1;
-
-                        let hash = Murmur64::hash(&buf);
-                        if hashes.contains(&hash) {
-                            found += 1;
-                            buf.push(LINE_FEED);
-                            writer.write_all(&buf).await?;
-                        } else {
-                            break;
-                        }
-                    }
-                }
-            }
-        }
-
-        let dur = Instant::now() - start;
-        if dur.as_secs() >= 1 {
-            let hashes_len = hashes.len();
-            let s = String::from_utf8_lossy(&buf);
-            // The last prefix in the set is the one that will stay in the buffer
-            // when we're about to print here.
-            // So we strip that, to show just the generated part.
-            // We also restrict the length to stay on a single line.
-            let prefix_len = prefixes[28].len();
-            let s = s[prefix_len..std::cmp::min(s.len(), prefix_len + 60)]
-                .trim_end()
-                .to_string();
-            // Don't care when it finishes, don't care if it fails.
-            tokio::spawn(async move {
-                let _ = tokio::io::stderr()
-                    .write_all(
-                        format!(
-                            "\r{:8} hashes per second | {:6}/{} found | {:<60}",
-                            count, found, hashes_len, s
-                        )
-                        .as_bytes(),
-                    )
-                    .await;
-            });
-
-            start = Instant::now();
-            count = 0;
-        }
+        task_tx.send(indices.clone())?;
 
         for i in 0..indices_len {
             let index = indices.get_mut(i).unwrap();
@@ -371,5 +503,25 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
         }
     }
 
+    // Dropping the senders will disconnect the channel,
+    // so that the threads holding the other end will eventually
+    // complete as well.
+    drop(task_tx);
+
+    tracing::debug!("Waiting for workers to finish.");
+
+    for handle in handles {
+        match handle.join() {
+            Ok(_) => {}
+            Err(value) => {
+                if let Some(err) = value.downcast_ref::<String>() {
+                    eyre::bail!("Thread failed: {}", err);
+                } else {
+                    eyre::bail!("Thread failed with unknown error: {:?}", value);
+                }
+            }
+        }
+    }
+
     Ok(())
 }
diff --git a/crates/dtmt/src/cmd/experiment/mod.rs b/crates/dtmt/src/cmd/experiment/mod.rs
index 9ceb3b9..c53d9b5 100644
--- a/crates/dtmt/src/cmd/experiment/mod.rs
+++ b/crates/dtmt/src/cmd/experiment/mod.rs
@@ -15,7 +15,9 @@ pub(crate) fn command_definition() -> Command {
 #[tracing::instrument(skip_all)]
 pub(crate) async fn run(ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
     match matches.subcommand() {
-        Some(("brute-force-words", sub_matches)) => brute_force_words::run(ctx, sub_matches).await,
+        // It's fine to block here, as this is the only thing that's executing on the runtime.
+ // The other option with `spawn_blocking` would require setting up values to be Send+Sync. + Some(("brute-force-words", sub_matches)) => brute_force_words::run(ctx, sub_matches), Some(("extract-words", sub_matches)) => extract_words::run(ctx, sub_matches).await, _ => unreachable!( "clap is configured to require a subcommand, and they're all handled above" From 64493547143f43b90e405a9ef98fe31066de93a9 Mon Sep 17 00:00:00 2001 From: Lucas Schwiderski Date: Tue, 19 Sep 2023 16:15:22 +0200 Subject: [PATCH 08/10] sdk: Reimplement logging current word --- .../src/cmd/experiment/brute_force_words.rs | 86 ++++++++----------- 1 file changed, 36 insertions(+), 50 deletions(-) diff --git a/crates/dtmt/src/cmd/experiment/brute_force_words.rs b/crates/dtmt/src/cmd/experiment/brute_force_words.rs index aa15003..4bf8556 100644 --- a/crates/dtmt/src/cmd/experiment/brute_force_words.rs +++ b/crates/dtmt/src/cmd/experiment/brute_force_words.rs @@ -1,6 +1,6 @@ use std::collections::HashSet; use std::fs; -use std::io::Write; +use std::io::{BufWriter, Write}; use std::path::PathBuf; use std::sync::Arc; use std::thread::JoinHandle; @@ -122,35 +122,30 @@ const PREFIXES: [&str; 29] = [ "wwise/world_sound_fx/", ]; -fn make_info_printer(rx: Receiver<(usize, usize)>, hash_count: usize) -> JoinHandle<()> { +fn make_info_printer(rx: Receiver<(usize, usize, String)>, hash_count: usize) -> JoinHandle<()> { std::thread::spawn(move || { let mut writer = std::io::stderr(); let mut total_count = 0; let mut total_found = 0; - let start = Instant::now(); + let mut start = Instant::now(); - while let Ok((count, found)) = rx.recv() { + while let Ok((count, found, last)) = rx.recv() { total_count += count; total_found += found; - let dur = Instant::now() - start; - if dur.as_secs() > 1 { - let s = format!("\r{total_count} per second | {total_found:6}/{hash_count} found",); - - // let s = String::from_utf8_lossy(&buf); - // // The last prefix in the set is the one that will stay in the buffer - // // when we're about to print here. - // // So we strip that, to show just the generated part. - // // We also restrict the length to stay on a single line. 
-            //     let prefix_len = prefixes[28].len();
-            //     let s = s[prefix_len..std::cmp::min(s.len(), prefix_len + 60)]
-            //         .trim_end()
-            //         .to_string();
+            let now = Instant::now();
+            if (now - start).as_millis() > 250 {
+                let s = &last[0..std::cmp::min(last.len(), 60)];
+                let s = format!(
+                    "\r{:12} per second | {total_found:6}/{hash_count} found | {s:<60}",
+                    total_count * 4
+                );
 
                 writer.write_all(s.as_bytes()).unwrap();
 
                 total_count = 0;
+                start = now;
             }
         }
     })
 }
@@ -158,7 +153,7 @@ fn make_info_printer(rx: Receiver<(usize, usize)>, hash_count: usize) -> JoinHan
 
 fn make_stdout_printer(rx: Receiver<Vec<u8>>) -> JoinHandle<()> {
     std::thread::spawn(move || {
-        let mut writer = std::io::stdout();
+        let mut writer = BufWriter::new(std::io::stdout());
 
         while let Ok(buf) = rx.recv() {
             writer.write_all(&buf).unwrap();
@@ -172,7 +167,7 @@ struct State {
     words: Arc<Vec<String>>,
     delimiters_len: usize,
     stdout_tx: Sender<Vec<u8>>,
-    info_tx: Sender<(usize, usize)>,
+    info_tx: Sender<(usize, usize, String)>,
 }
 
 fn make_worker(rx: Receiver<Vec<usize>>, state: State) -> JoinHandle<()> {
@@ -186,7 +181,6 @@ fn make_worker(rx: Receiver<Vec<usize>>, state: State) -> JoinHandle<()> {
     let mut found = 0;
     let mut buf = Vec::with_capacity(1024);
 
-    // while let Some(indices) = find_task(local, global, &[]) {
     while let Ok(indices) = rx.recv() {
         let sequence = indices.iter().map(|i| words[*i].as_str());
 
@@ -224,7 +218,7 @@ fn make_worker(rx: Receiver<Vec<usize>>, state: State) -> JoinHandle<()> {
                         found += 1;
 
                         buf.push(LINE_FEED);
-                        if let Err(_) = state.stdout_tx.send(buf.clone()) {
+                        if state.stdout_tx.send(buf.clone()).is_err() {
                             return;
                         }
                     } else {
@@ -250,7 +244,7 @@ fn make_worker(rx: Receiver<Vec<usize>>, state: State) -> JoinHandle<()> {
                                 found += 1;
 
                                 buf.push(LINE_FEED);
-                                if let Err(_) = state.stdout_tx.send(buf.clone()) {
+                                if state.stdout_tx.send(buf.clone()).is_err() {
                                     return;
                                 }
                             } else {
@@ -261,30 +255,22 @@ fn make_worker(rx: Receiver<Vec<usize>>, state: State) -> JoinHandle<()> {
             }
         }
 
-        if count >= 1024 * 1024 {
-            let _ = state.info_tx.send((count, found));
+        if count >= 2 * 1024 * 1024 {
+            // The last prefix in the set is the one that will stay in the buffer
+            // when we're about to print here.
+            // So we strip that, to show just the generated part.
+            // We also restrict the length to stay on a single line.
+            let prefix_len = PREFIXES[28].len();
+            // No need to wait for this
+            let _ = state.info_tx.try_send((
+                count,
+                found,
+                String::from_utf8_lossy(&buf[prefix_len..]).to_string(),
+            ));
+
+            count = 0;
+            found = 0;
         }
-
-        // let dur = Instant::now() - start;
-        // if dur.as_secs() >= 1 {
-        //     let hashes_len = hashes.len();
-        //     let s = String::from_utf8_lossy(&buf);
-        //     // The last prefix in the set is the one that will stay in the buffer
-        //     // when we're about to print here.
-        //     // So we strip that, to show just the generated part.
-        //     // We also restrict the length to stay on a single line.
From 6ada4c1c43298c11593422b0c0aa2330738ae5fe Mon Sep 17 00:00:00 2001
From: Lucas Schwiderski
Date: Wed, 20 Sep 2023 11:33:24 +0200
Subject: [PATCH 09/10] sdk: Add additional brute force prefixes

---
 crates/dtmt/src/cmd/experiment/brute_force_words.rs | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/crates/dtmt/src/cmd/experiment/brute_force_words.rs b/crates/dtmt/src/cmd/experiment/brute_force_words.rs
index 4bf8556..26d887f 100644
--- a/crates/dtmt/src/cmd/experiment/brute_force_words.rs
+++ b/crates/dtmt/src/cmd/experiment/brute_force_words.rs
@@ -90,13 +90,14 @@ const LINE_FEED: u8 = 0x0A;
 const UNDERSCORE: u8 = 0x5F;
 const ZERO: u8 = 0x30;
 
-const PREFIXES: [&str; 29] = [
+const PREFIXES: [&str; 36] = [
     "",
     "content/characters/",
     "content/debug/",
     "content/decals/",
     "content/environment/",
     "content/fx/",
+    "content/fx/particles/",
     "content/gizmos/",
     "content/items/",
     "content/levels/",
@@ -112,14 +113,20 @@ const PREFIXES: [&str; 29] = [
     "content/vo/",
     "content/volume_types/",
     "content/weapons/",
+    "content/",
+    "core/",
+    "core/units/",
     "packages/boot_assets/",
     "packages/content/",
     "packages/game_scripts/",
     "packages/strings/",
     "packages/ui/",
+    "packages/",
     "wwise/events/",
     "wwise/packages/",
     "wwise/world_sound_fx/",
+    "wwise/events/weapons/",
+    "wwise/events/minions/",
 ];
 
 fn make_info_printer(rx: Receiver<(usize, usize, String)>, hash_count: usize) -> JoinHandle<()> {
@@ -153,7 +160,7 @@ fn make_info_printer(rx: Receiver<(usize, usize, String)>, hash_count: usize) ->
 
 fn make_stdout_printer(rx: Receiver<Vec<u8>>) -> JoinHandle<()> {
     std::thread::spawn(move || {
-        let mut writer = BufWriter::new(std::io::stdout());
+        let mut writer = std::io::stdout();
 
         while let Ok(buf) = rx.recv() {
             writer.write_all(&buf).unwrap();
@@ -260,7 +267,7 @@ fn make_worker(rx: Receiver<Vec<usize>>, state: State) -> JoinHandle<()> {
             // when we're about to print here.
             // So we strip that, to show just the generated part.
             // We also restrict the length to stay on a single line.
-            let prefix_len = PREFIXES[28].len();
+            let prefix_len = PREFIXES[35].len();
             // No need to wait for this
             let _ = state.info_tx.try_send((
                 count,
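Growing PREFIXES from 29 to 36 entries also forces the hand-maintained index in the progress sample to move (`PREFIXES[28]` to `PREFIXES[35]`), since the prefix stripped from the display buffer is simply the last entry of the array. A sketch of an alternative that tracks the array automatically (not part of the series, and assuming the intent really is "always the final entry"):

    // Length of the last prefix without a hard-coded index; falls back to 0
    // for an empty array.
    let prefix_len = PREFIXES.last().map(|p| p.len()).unwrap_or(0);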
From ae1e7e5aa6e8c0ca54da5a485cad9a2e0e64b869 Mon Sep 17 00:00:00 2001
From: Lucas Schwiderski
Date: Fri, 22 Sep 2023 11:46:57 +0200
Subject: [PATCH 10/10] dtmt: Add word extraction algorithm for paths

---
 .../dtmt/src/cmd/experiment/extract_words.rs  | 311 +++++++++++++++++-
 1 file changed, 296 insertions(+), 15 deletions(-)

diff --git a/crates/dtmt/src/cmd/experiment/extract_words.rs b/crates/dtmt/src/cmd/experiment/extract_words.rs
index 512038d..1a8cda5 100644
--- a/crates/dtmt/src/cmd/experiment/extract_words.rs
+++ b/crates/dtmt/src/cmd/experiment/extract_words.rs
@@ -1,4 +1,4 @@
-use std::collections::HashSet;
+use std::collections::HashMap;
 use std::path::PathBuf;
 
 use clap::{value_parser, Arg, ArgMatches, Command, ValueEnum};
@@ -36,7 +36,7 @@ pub(crate) fn command_definition() -> Command {
         )
 }
 
-#[derive(Copy, Clone, Debug, ValueEnum)]
+#[derive(Copy, Clone, Debug, Eq, PartialEq, ValueEnum)]
 #[value(rename_all = "snake_case")]
 enum Algorithm {
     Alphabetic,
@@ -45,6 +45,7 @@ enum Algorithm {
     Number,
     Hash32,
     Hash64,
+    Paths,
 }
 
 impl Algorithm {
@@ -55,6 +56,8 @@ impl Algorithm {
             Self::Identifier => c.is_ascii_alphabetic(),
             Self::Number => c.is_numeric(),
             Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
+            // Supposed to be handled separately
+            Self::Paths => false,
         }
     }
 
@@ -65,6 +68,8 @@ impl Algorithm {
             Self::Identifier => c.is_ascii_alphanumeric(),
             Self::Number => c.is_numeric(),
             Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
+            // Supposed to be handled separately
+            Self::Paths => false,
         }
     }
 
@@ -76,6 +81,8 @@ impl Algorithm {
             Self::Number => true,
             Self::Hash32 => len == 8,
             Self::Hash64 => len == 16,
+            // Supposed to be handled separately
+            Self::Paths => false,
         }
     }
 }
@@ -92,11 +99,274 @@ impl std::fmt::Display for Algorithm {
                 Algorithm::Number => "number",
                 Algorithm::Hash32 => "hash32",
                 Algorithm::Hash64 => "hash64",
+                Algorithm::Paths => "paths",
             }
         )
     }
 }
 
+#[derive(Copy, Clone, Debug)]
+enum PathState {
+    Begin,
+    PathComponent,
+    PathSeparator,
+    Boundary,
+    NonWord,
+    End,
+}
+
+#[tracing::instrument(skip(chars))]
+fn extract_paths(chars: impl Iterator<Item = char>) -> Vec<Vec<String>> {
+    let mut chars = chars.peekable();
+
+    let mut state = PathState::Begin;
+    let mut list = Vec::new();
+    let mut path = Vec::new();
+    let mut word = String::new();
+
+    let is_boundary = |c: char| c == '\n' || c == ' ' || c == ',' || c == '\t' || c == '|';
+
+    'machine: loop {
+        state = match state {
+            PathState::Begin => match chars.next() {
+                None => PathState::End,
+                Some(c) if c.is_ascii_alphabetic() => {
+                    word.push(c);
+                    PathState::PathComponent
+                }
+                Some(c) if is_boundary(c) => PathState::Boundary,
+                Some('/') => PathState::PathSeparator,
+                Some(_) => PathState::NonWord,
+            },
+            PathState::PathComponent => match chars.next() {
+                None => {
+                    path.push(word.clone());
+                    list.push(path.clone());
+
+                    PathState::End
+                }
+                Some(c) if c.is_ascii_alphanumeric() || c == '_' => {
+                    word.push(c);
+                    PathState::PathComponent
+                }
+                Some('/') => {
+                    path.push(word.clone());
+                    word.clear();
+
+                    PathState::PathSeparator
+                }
+                Some(c) if is_boundary(c) => {
+                    path.push(word.clone());
+                    list.push(path.clone());
+
+                    path.clear();
+                    word.clear();
+
+                    PathState::Boundary
+                }
+                Some(_) => {
+                    list.push(path.clone());
+
+                    path.clear();
+                    word.clear();
+
+                    PathState::NonWord
+                }
+            },
+            PathState::PathSeparator => match chars.next() {
+                None => {
+                    list.push(path.clone());
+                    PathState::End
+                }
+                Some('/') => PathState::PathSeparator,
+                Some(c) if c.is_ascii_alphabetic() || c == '_' => {
+                    word.push(c);
+                    PathState::PathComponent
+                }
+                Some(c) if is_boundary(c) => {
+                    list.push(path.clone());
+                    path.clear();
+                    PathState::Boundary
+                }
+                Some(_) => {
+                    list.push(path.clone());
+                    path.clear();
+                    PathState::NonWord
+                }
+            },
+            PathState::Boundary => match chars.next() {
+                None => PathState::End,
+                Some(c) if c.is_ascii_alphabetic() => {
+                    word.push(c);
+                    PathState::PathComponent
+                }
+                Some(c) if is_boundary(c) => PathState::Boundary,
+                Some(_) => PathState::NonWord,
+            },
+            PathState::NonWord => match chars.next() {
+                None => PathState::End,
+                Some(c) if is_boundary(c) => PathState::Boundary,
+                Some(_) => PathState::NonWord,
+            },
+            PathState::End => {
+                break 'machine;
+            }
+        }
+    }
+
+    list
+}
+
+#[tracing::instrument(skip(chars))]
+fn algorithm_path_components(chars: impl Iterator<Item = char>, min_length: usize) {
+    let mut chars = chars.peekable();
+
+    let mut state = PathState::Begin;
+    let mut word = String::new();
+    let mut lists = vec![HashMap::<String, usize>::new()];
+    let mut index = 0;
+
+    let is_boundary = |c: char| c == '\n' || c == ' ' || c == ',' || c == '\t';
+
+    'machine: loop {
+        state = match state {
+            PathState::Begin => match chars.next() {
+                None => PathState::End,
+                Some(c) if c.is_ascii_alphabetic() => {
+                    word.push(c);
+                    PathState::PathComponent
+                }
+                Some(c) if is_boundary(c) => PathState::Boundary,
+                // Ignore leading path separators to not trigger the logic of advancing
+                // the component count
+                Some('/') => PathState::Boundary,
+                Some(_) => PathState::NonWord,
+            },
+            PathState::PathComponent => match chars.next() {
+                None => PathState::End,
+                Some(c) if c.is_ascii_alphanumeric() || c == '_' => {
+                    word.push(c);
+                    PathState::PathComponent
+                }
+                Some('/') => PathState::PathSeparator,
+                Some(c) => {
+                    if index > 0 && word.len() >= min_length {
+                        let list = &mut lists[index];
+                        list.entry(word.clone())
+                            .and_modify(|count| *count += 1)
+                            .or_insert(1);
+                    }
+                    word.clear();
+
+                    index = 0;
+
+                    if is_boundary(c) {
+                        PathState::Boundary
+                    } else {
+                        PathState::NonWord
+                    }
+                }
+            },
+            PathState::PathSeparator => {
+                if word.len() >= min_length {
+                    let list = &mut lists[index];
+                    list.entry(word.clone())
+                        .and_modify(|count| *count += 1)
+                        .or_insert(1);
+                }
+                word.clear();
+
+                index += 1;
+                if lists.get(index).is_none() {
+                    lists.push(HashMap::new());
+                }
+
+                // Ignore multiple separators
+                while chars.next_if(|c| *c == '/').is_some() {}
+
+                match chars.next() {
+                    None => PathState::End,
+                    Some(c) if c.is_ascii_alphabetic() || c == '_' => {
+                        word.push(c);
+                        PathState::PathComponent
+                    }
+                    Some(c) if is_boundary(c) => {
+                        index = 0;
+                        PathState::Boundary
+                    }
+                    Some(_) => {
+                        index = 0;
+                        PathState::NonWord
+                    }
+                }
+            }
+            PathState::Boundary => match chars.next() {
+                None => PathState::End,
+                Some(c) if c.is_ascii_alphabetic() => {
+                    word.push(c);
+                    PathState::PathComponent
+                }
+                Some(c) if is_boundary(c) => PathState::Boundary,
+                Some(_) => PathState::NonWord,
+            },
+            PathState::NonWord => match chars.next() {
+                None => PathState::End,
+                Some(c) if is_boundary(c) => PathState::Boundary,
+                Some(_) => PathState::NonWord,
+            },
+            PathState::End => {
+                if word.len() >= min_length {
+                    let list = &mut lists[index];
+                    list.entry(word.clone())
+                        .and_modify(|count| *count += 1)
+                        .or_insert(1);
+                }
+
+                break 'machine;
+            }
+        }
+    }
+
+    for i in 0..lists.len() {
+        print!("Word {i}, Count {i},");
+    }
+    println!();
+
+    let mut lines: Vec<Vec<Option<(String, usize)>>> = Vec::new();
+
+    for (i, list) in lists.into_iter().enumerate() {
+        let mut entries = list.into_iter().collect::<Vec<_>>();
+        entries.sort_by(|(_, a), (_, b)| b.partial_cmp(a).unwrap());
+
+        for (j, (word, count)) in entries.into_iter().enumerate() {
+            if let Some(line) = lines.get_mut(j) {
+                while line.len() < i {
+                    line.push(None);
+                }
+                line.push(Some((word, count)));
+            } else {
+                let mut line = Vec::new();
+                while line.len() < i {
+                    line.push(None);
+                }
+                line.push(Some((word, count)));
+                lines.push(line);
+            }
+        }
+    }
+
+    for line in lines.iter() {
+        for cell in line.iter() {
+            if let Some((word, count)) = cell {
+                print!("{},{},", word, count);
+            } else {
+                print!(",,");
+            }
+        }
+        println!();
+    }
+}
+
 #[derive(Copy, Clone, Debug)]
 enum State {
     Begin,
@@ -125,9 +395,14 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
         .wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
     let mut chars = content.chars();
 
+    if *algorithm == Algorithm::Paths {
+        algorithm_path_components(chars, min_length);
+        return Ok(());
+    }
+
     let mut state = State::Begin;
     let mut word = String::new();
-    let mut visited = HashSet::new();
+    let mut visited = HashMap::new();
 
     'machine: loop {
         state = match state {
@@ -150,12 +425,11 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
             },
             State::Word => match chars.next() {
                 None => {
-                    if word.len() >= min_length
-                        && algorithm.is_length(word.len())
-                        && !visited.contains(&word)
-                    {
-                        println!("{}", &word);
-                        visited.insert(word.clone());
+                    if word.len() >= min_length && algorithm.is_length(word.len()) {
+                        visited
+                            .entry(word.clone())
+                            .and_modify(|v| *v += 1)
+                            .or_insert(1);
                     }
                     State::End
                 }
@@ -164,12 +438,11 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
                     State::Word
                 }
                 Some(_) => {
-                    if word.len() >= min_length
-                        && algorithm.is_length(word.len())
-                        && !visited.contains(&word)
-                    {
-                        println!("{}", &word);
-                        visited.insert(word.clone());
+                    if word.len() >= min_length && algorithm.is_length(word.len()) {
+                        visited
+                            .entry(word.clone())
+                            .and_modify(|v| *v += 1)
+                            .or_insert(1);
                     }
                     word.clear();
                     State::NonWord
@@ -178,5 +451,13 @@ pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()>
         }
     }
 
+    let mut entries: Vec<(String, usize)> = visited.into_iter().collect();
+    // Reverse sides during comparison to get "highest to lowest"
+    entries.sort_by(|(_, a), (_, b)| b.partial_cmp(a).unwrap());
+
+    entries
+        .iter()
+        .for_each(|(word, count)| println!("{:016} {}", word, count));
+
     Ok(())
 }
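Taken together, the `paths` algorithm tokenizes the input with the PathState machine, tallies each path component into a per-depth frequency table (`lists[depth]`), and prints the tables side by side as CSV columns sorted by count. A condensed sketch of that counting idea, using plain string splitting in place of the state machine; the tokenization below (split on whitespace, then on '/') is a simplification of the patch's boundary rules, and `count_by_depth` is an illustrative name, not from the series:

    use std::collections::HashMap;

    fn count_by_depth(input: &str, min_length: usize) -> Vec<HashMap<String, usize>> {
        let mut lists: Vec<HashMap<String, usize>> = Vec::new();
        for path in input.split_whitespace() {
            for (depth, component) in path.split('/').filter(|c| !c.is_empty()).enumerate() {
                // Mirror the identifier rules: minimum length, ASCII
                // alphanumerics and underscores only.
                let valid = component.len() >= min_length
                    && component.chars().all(|c| c.is_ascii_alphanumeric() || c == '_');
                if !valid {
                    continue;
                }
                if lists.len() <= depth {
                    lists.resize_with(depth + 1, HashMap::new);
                }
                *lists[depth].entry(component.to_string()).or_insert(0) += 1;
            }
        }
        lists
    }

    fn main() {
        let input = "content/fx/impact content/fx/ricochet wwise/events/weapons";
        for (depth, list) in count_by_depth(input, 3).iter().enumerate() {
            // Depth 0: content=2, wwise=1; depth 1: fx=2, events=1; etc.
            println!("depth {depth}: {list:?}");
        }
    }

Assuming the subcommand wiring from earlier in the series, an invocation might look like `dtmt experiment extract-words strings.txt --algorithm paths --min-length 3 > components.csv`. One small quirk in the non-paths output path: Rust's `0` fill flag also applies to strings, so `{:016}` zero-pads the word column; `{:<16}` may have been the intended alignment.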