diff --git a/Cargo.lock b/Cargo.lock index a251de9..3a02b55 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -161,6 +161,17 @@ dependencies = [ "system-deps", ] +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi 0.1.19", + "libc", + "winapi", +] + [[package]] name = "autocfg" version = "1.3.0" @@ -212,7 +223,7 @@ dependencies = [ "bitflags 2.5.0", "cexpr", "clang-sys", - "itertools", + "itertools 0.12.1", "lazy_static", "lazycell", "log", @@ -647,6 +658,20 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2801af0d36612ae591caa9568261fddce32ce6e08a7275ea334a06a4ad021a2c" +dependencies = [ + "cfg-if", + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-epoch", + "crossbeam-queue", + "crossbeam-utils", +] + [[package]] name = "crossbeam-channel" version = "0.5.12" @@ -656,6 +681,40 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "crossbeam-deque" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +dependencies = [ + "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" +dependencies = [ + "autocfg", + "cfg-if", + "crossbeam-utils", + "memoffset 0.9.1", + "scopeguard", +] + +[[package]] +name = "crossbeam-queue" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1cfb3ea8a53f37c40dea2c7bedcbd88bdfae54f5e2175d6ecaff1c988353add" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.20" @@ -927,15 +986,18 @@ name = "dtmt" version = "0.3.0" dependencies = [ "async-recursion", + "atty", "clap", "cli-table", "color-eyre", "confy", + "crossbeam", "csv-async", "dtmt-shared", "futures", "futures-util", "glob", + "itertools 0.11.0", "luajit2-sys", "nanorand", "notify", @@ -1598,6 +1660,15 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + [[package]] name = "hermit-abi" version = "0.3.9" @@ -1858,6 +1929,15 @@ version = "1.70.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.12.1" @@ -2267,7 +2347,7 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" dependencies = [ - "hermit-abi", + "hermit-abi 0.3.9", "libc", ] diff --git a/crates/dtmt/Cargo.toml b/crates/dtmt/Cargo.toml index d836a50..e80feaa 100644 --- a/crates/dtmt/Cargo.toml +++ b/crates/dtmt/Cargo.toml @@ -33,6 +33,9 @@ async-recursion = "1.0.2" notify = "6.1.1" luajit2-sys = { path = "../../lib/luajit2-sys", version = "*" } shlex = { version = "1.2.0", optional = true } +atty = "0.2.14" +itertools = "0.11.0" +crossbeam = { version = "0.8.2", features = ["crossbeam-deque"] } [dev-dependencies] tempfile = "3.3.0" diff --git a/crates/dtmt/src/cmd/experiment/brute_force_words.rs b/crates/dtmt/src/cmd/experiment/brute_force_words.rs new file mode 100644 index 0000000..26d887f --- /dev/null +++ b/crates/dtmt/src/cmd/experiment/brute_force_words.rs @@ -0,0 +1,520 @@ +use std::collections::HashSet; +use std::fs; +use std::io::{BufWriter, Write}; +use std::path::PathBuf; +use std::sync::Arc; +use std::thread::JoinHandle; + +use clap::{value_parser, Arg, ArgAction, ArgMatches, Command}; +use color_eyre::eyre::{self, Context}; +use color_eyre::Result; +use crossbeam::channel::{bounded, unbounded, Receiver, Sender}; +use itertools::Itertools; +use sdk::murmur::Murmur64; +use tokio::time::Instant; + +pub(crate) fn command_definition() -> Command { + Command::new("brute-force-words") + .about( + "Given a list of words and a set of delimiters, iteratevily creates permutations \ + of growing length.\n\ + Delimiters are placed between every word in the result.\n\n\ + Example: \ + Given the words ['packages', 'boot'], the delimiters ['/', '_'] and a length of 2, the resulting \ + words will be\n\ + - packages\n\ + - boot\n\ + - packages/packages\n\ + - packages_packages\n\ + - packages/boot\n\ + - packages_boot\n\ + - boot/packages\n\ + - boot_packages\n\ + - boot/boot\n\ + - boot_boot", + ) + .arg( + Arg::new("delimiter") + .help( + "The delimiters to put between the words. \ + All permutations of this list will be tried for every string of words.\n\ + Specify multiple times to set multiple values.\n\ + Defaults to ['/', '_'].", + ) + .short('d') + .long("delimiter") + .action(ArgAction::Append), + ) + .arg( + Arg::new("max-length") + .help("The maximum number of words up to which to build strings.") + .long("max") + .long("max-length") + .short('m') + .default_value("5") + .value_parser(value_parser!(usize)), + ) + .arg( + Arg::new("continue") + .help("Can be used to continue a previous operation where it stopped. Word list and delimiters must match.") + .short('c') + .long("continue") + ) + .arg( + Arg::new("threads") + .help("The number of workers to run in parallel.") + .long("threads") + .short('n') + .default_value("6") + .value_parser(value_parser!(usize)) + ) + .arg( + Arg::new("words") + .help("Path to a file containing words line by line.") + .required(true) + .value_parser(value_parser!(PathBuf)), + ) + .arg( + Arg::new("hashes") + .help( + "Path to a file containing the hashes to attempt to brute force. \ + Hashes are expected in hexadecimal notiation. \ + Only 64-bit hashes are supported." + ) + .required(true) + .value_parser(value_parser!(PathBuf)), + ) +} + +const LINE_FEED: u8 = 0x0A; +const UNDERSCORE: u8 = 0x5F; +const ZERO: u8 = 0x30; + +const PREFIXES: [&str; 36] = [ + "", + "content/characters/", + "content/debug/", + "content/decals/", + "content/environment/", + "content/fx/", + "content/fx/particles/", + "content/gizmos/", + "content/items/", + "content/levels/", + "content/liquid_area/", + "content/localization/", + "content/materials/", + "content/minion_impact_assets/", + "content/pickups/", + "content/shading_environments/", + "content/textures/", + "content/ui/", + "content/videos/", + "content/vo/", + "content/volume_types/", + "content/weapons/", + "content/", + "core/", + "core/units/", + "packages/boot_assets/", + "packages/content/", + "packages/game_scripts/", + "packages/strings/", + "packages/ui/", + "packages/", + "wwise/events/", + "wwise/packages/", + "wwise/world_sound_fx/", + "wwise/events/weapons/", + "wwise/events/minions/", +]; + +fn make_info_printer(rx: Receiver<(usize, usize, String)>, hash_count: usize) -> JoinHandle<()> { + std::thread::spawn(move || { + let mut writer = std::io::stderr(); + let mut total_count = 0; + let mut total_found = 0; + + let mut start = Instant::now(); + + while let Ok((count, found, last)) = rx.recv() { + total_count += count; + total_found += found; + + let now = Instant::now(); + if (now - start).as_millis() > 250 { + let s = &last[0..std::cmp::min(last.len(), 60)]; + let s = format!( + "\r{:12} per second | {total_found:6}/{hash_count} found | {s:<60}", + total_count * 4 + ); + + writer.write_all(s.as_bytes()).unwrap(); + + total_count = 0; + start = now; + } + } + }) +} + +fn make_stdout_printer(rx: Receiver>) -> JoinHandle<()> { + std::thread::spawn(move || { + let mut writer = std::io::stdout(); + + while let Ok(buf) = rx.recv() { + writer.write_all(&buf).unwrap(); + } + }) +} + +struct State { + delimiter_lists: Arc>>, + hashes: Arc>, + words: Arc>, + delimiters_len: usize, + stdout_tx: Sender>, + info_tx: Sender<(usize, usize, String)>, +} + +fn make_worker(rx: Receiver>, state: State) -> JoinHandle<()> { + std::thread::spawn(move || { + let delimiter_lists = &state.delimiter_lists; + let hashes = &state.hashes; + let words = &state.words; + let delimiters_len = state.delimiters_len; + + let mut count = 0; + let mut found = 0; + let mut buf = Vec::with_capacity(1024); + + while let Ok(indices) = rx.recv() { + let sequence = indices.iter().map(|i| words[*i].as_str()); + + // We only want delimiters between words, so we keep that iterator shorter by + // one. + let delimiter_count = sequence.len() as u32 - 1; + + for prefix in PREFIXES.iter().map(|p| p.as_bytes()) { + buf.clear(); + + // We can keep the prefix at the front of the buffer and only + // replace the parts after that. + let prefix_len = prefix.len(); + buf.extend_from_slice(prefix); + + for delims in delimiter_lists + .iter() + .take(delimiters_len.pow(delimiter_count)) + { + buf.truncate(prefix_len); + + let delims = delims + .iter() + .map(|s| s.as_str()) + .take(delimiter_count as usize); + sequence + .clone() + .interleave(delims.clone()) + .for_each(|word| buf.extend_from_slice(word.as_bytes())); + + count += 1; + + let hash = Murmur64::hash(&buf); + if hashes.contains(&hash) { + found += 1; + + buf.push(LINE_FEED); + if state.stdout_tx.send(buf.clone()).is_err() { + return; + } + } else { + let word_len = buf.len(); + + // If the regular word itself didn't match, we check + // for numbered suffixes. + // For now, we only check up to `09` to avoid more complex logic + // writing into the buffer. + // Packages that contain files with higher numbers than this + // should hopefully become easier to spot once a good number of + // hashes is found. + for i in 1..=9 { + buf.truncate(word_len); + buf.push(UNDERSCORE); + buf.push(ZERO); + buf.push(ZERO + i); + + count += 1; + + let hash = Murmur64::hash(&buf); + if hashes.contains(&hash) { + found += 1; + + buf.push(LINE_FEED); + if state.stdout_tx.send(buf.clone()).is_err() { + return; + } + } else { + break; + } + } + } + } + } + + if count >= 2 * 1024 * 1024 { + // The last prefix in the set is the one that will stay in the buffer + // when we're about to print here. + // So we strip that, to show just the generated part. + // We also restrict the length to stay on a single line. + let prefix_len = PREFIXES[35].len(); + // No need to wait for this + let _ = state.info_tx.try_send(( + count, + found, + String::from_utf8_lossy(&buf[prefix_len..]).to_string(), + )); + + count = 0; + found = 0; + } + } + }) +} + +fn build_delimiter_lists(delimiters: impl AsRef<[String]>, max_length: usize) -> Vec> { + let delimiters = delimiters.as_ref(); + let mut indices = vec![0; max_length]; + let mut list = Vec::new(); + + for _ in 0..delimiters.len().pow(max_length as u32) { + list.push( + indices + .iter() + .map(|i| delimiters[*i].clone()) + .collect::>(), + ); + + for v in indices.iter_mut() { + if *v >= delimiters.len() - 1 { + *v = 0; + break; + } else { + *v += 1; + } + } + } + + list +} + +fn build_initial_indices( + cont: Option<&String>, + delimiters: impl AsRef<[String]>, + words: impl AsRef<[String]>, +) -> Result> { + if let Some(cont) = cont { + let mut splits = vec![cont.clone()]; + + for delim in delimiters.as_ref().iter() { + splits = splits + .iter() + .flat_map(|s| s.split(delim)) + .map(|s| s.to_string()) + .collect(); + } + + let indices = splits + .into_iter() + .map(|s| { + words + .as_ref() + .iter() + .enumerate() + .find(|(_, v)| s == **v) + .map(|(i, _)| i) + .ok_or_else(|| eyre::eyre!("'{}' is not in the word list", s)) + }) + .collect::>()?; + + tracing::info!("Continuing from '{}' -> '{:?}'", cont, &indices); + + Ok(indices) + } else { + Ok(vec![0]) + } +} + +#[tracing::instrument(skip_all)] +#[allow(clippy::mut_range_bound)] +pub(crate) fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> { + let max_length: usize = matches + .get_one::("max-length") + .copied() + .expect("parameter has default"); + + let num_threads: usize = matches + .get_one::("threads") + .copied() + .expect("parameter has default"); + + let words = { + let path = matches + .get_one::("words") + .expect("missing required parameter"); + + let file = fs::read_to_string(path) + .wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?; + + let words: Vec<_> = file.lines().map(str::to_string).collect(); + + if words.is_empty() { + eyre::bail!("Word list must not be empty"); + } + + Arc::new(words) + }; + + let hashes = { + let path = matches + .get_one::("hashes") + .expect("missing required argument"); + let content = fs::read_to_string(path) + .wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?; + + let hashes: Result, _> = content + .lines() + .map(|s| u64::from_str_radix(s, 16).map(Murmur64::from)) + .collect(); + + let hashes = hashes?; + + tracing::trace!("{:?}", hashes); + + Arc::new(hashes) + }; + + let mut delimiters: Vec = matches + .get_many::("delimiter") + .unwrap_or_default() + .cloned() + .collect(); + + if delimiters.is_empty() { + delimiters.push(String::from("/")); + delimiters.push(String::from("_")); + } + + let delimiters_len = delimiters.len(); + + let word_count = words.len(); + tracing::info!("{} words to try", word_count); + + // To be able to easily combine the permutations of words and delimiters, + // we turn the latter into a pre-defined list of all permutations of delimiters + // that are possible at the given amount of words. + // Combining `Iterator::cycle` with `Itertools::permutations` works, but + // with a high `max_length`, it runs OOM. + // So we basically have to implement a smaller version of the iterative algorithm we use later on + // to build permutations of the actual words. + let delimiter_lists = { + let lists = build_delimiter_lists(&delimiters, max_length - 1); + Arc::new(lists) + }; + tracing::debug!("{:?}", delimiter_lists); + + let (info_tx, info_rx) = bounded(100); + let (stdout_tx, stdout_rx) = unbounded::>(); + let (task_tx, task_rx) = bounded::>(num_threads * 4); + let mut handles = Vec::new(); + + for _ in 0..num_threads { + let handle = make_worker( + task_rx.clone(), + State { + delimiter_lists: Arc::clone(&delimiter_lists), + hashes: Arc::clone(&hashes), + words: Arc::clone(&words), + delimiters_len, + stdout_tx: stdout_tx.clone(), + info_tx: info_tx.clone(), + }, + ); + handles.push(handle); + } + // These are only used inside the worker threads, but due to the loops above, we had to + // clone them one too many times. + // So we drop that extra reference immediately, to ensure that the channels can + // disconnect properly when the threads finish. + drop(stdout_tx); + drop(info_tx); + + handles.push(make_info_printer(info_rx, hashes.len())); + handles.push(make_stdout_printer(stdout_rx)); + + let mut indices = + build_initial_indices(matches.get_one::("continue"), &delimiters, &*words) + .wrap_err("Failed to build initial indices")?; + let mut indices_len = indices.len(); + let mut sequence = indices + .iter() + .map(|index| words[*index].as_str()) + .collect::>(); + + // Prevent re-allocation by reserving as much as we need upfront + indices.reserve(max_length); + sequence.reserve(max_length); + + 'outer: loop { + task_tx.send(indices.clone())?; + + for i in 0..indices_len { + let index = indices.get_mut(i).unwrap(); + let word = sequence.get_mut(i).unwrap(); + + if *index >= word_count - 1 { + *index = 0; + *word = words[*index].as_str(); + + if indices.get(i + 1).is_none() { + indices_len += 1; + + if indices_len > max_length { + break 'outer; + } + + indices.push(0); + sequence.push(words[0].as_str()); + + break; + } + } else { + *index += 1; + *word = words[*index].as_str(); + break; + } + } + } + + // Dropping the senders will disconnect the channel, + // so that the threads holding the other end will eventually + // complete as well. + drop(task_tx); + + for handle in handles { + match handle.join() { + Ok(_) => {} + Err(value) => { + if let Some(err) = value.downcast_ref::() { + eyre::bail!("Thread failed: {}", err); + } else { + eyre::bail!("Thread failed with unknown error: {:?}", value); + } + } + } + } + + let _ = std::io::stdout().write_all("\r".as_bytes()); + + Ok(()) +} diff --git a/crates/dtmt/src/cmd/experiment/extract_words.rs b/crates/dtmt/src/cmd/experiment/extract_words.rs new file mode 100644 index 0000000..1a8cda5 --- /dev/null +++ b/crates/dtmt/src/cmd/experiment/extract_words.rs @@ -0,0 +1,463 @@ +use std::collections::HashMap; +use std::path::PathBuf; + +use clap::{value_parser, Arg, ArgMatches, Command, ValueEnum}; +use color_eyre::eyre::Context; +use color_eyre::Result; +use tokio::fs; + +pub(crate) fn command_definition() -> Command { + Command::new("extract-words") + .about( + "Extract unique alphanumeric sequences that match common identifier rules from the given file. \ + Only ASCII is supported.", + ) + .arg( + Arg::new("file") + .required(true) + .value_parser(value_parser!(PathBuf)) + .help("Path to the file to extract words from."), + ) + .arg( + Arg::new("min-length") + .help("Minimum length to consider a word.") + .long("min-length") + .short('m') + .default_value("3") + .value_parser(value_parser!(usize)) + ) + .arg( + Arg::new("algorithm") + .help("The algorithm to determine matching words") + .long("algorithm") + .short('a') + .default_value("identifier") + .value_parser(value_parser!(Algorithm)) + ) +} + +#[derive(Copy, Clone, Debug, Eq, PartialEq, ValueEnum)] +#[value(rename_all = "snake_case")] +enum Algorithm { + Alphabetic, + Alphanumeric, + Identifier, + Number, + Hash32, + Hash64, + Paths, +} + +impl Algorithm { + fn is_start(&self, c: char) -> bool { + match self { + Self::Alphabetic => c.is_ascii_alphabetic(), + Self::Alphanumeric => c.is_ascii_alphanumeric(), + Self::Identifier => c.is_ascii_alphabetic(), + Self::Number => c.is_numeric(), + Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'), + // Supposed to be handled separately + Self::Paths => false, + } + } + + fn is_body(&self, c: char) -> bool { + match self { + Self::Alphabetic => c.is_ascii_alphabetic(), + Self::Alphanumeric => c.is_ascii_alphanumeric(), + Self::Identifier => c.is_ascii_alphanumeric(), + Self::Number => c.is_numeric(), + Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'), + // Supposed to be handled separately + Self::Paths => false, + } + } + + fn is_length(&self, len: usize) -> bool { + match self { + Self::Alphabetic => true, + Self::Alphanumeric => true, + Self::Identifier => true, + Self::Number => true, + Self::Hash32 => len == 8, + Self::Hash64 => len == 16, + // Supposed to be handled separately + Self::Paths => false, + } + } +} + +impl std::fmt::Display for Algorithm { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}", + match self { + Algorithm::Alphabetic => "alphabetic", + Algorithm::Alphanumeric => "alphanumeric", + Algorithm::Identifier => "identifier", + Algorithm::Number => "number", + Algorithm::Hash32 => "hash32", + Algorithm::Hash64 => "hash64", + Algorithm::Paths => "paths", + } + ) + } +} + +#[derive(Copy, Clone, Debug)] +enum PathState { + Begin, + PathComponent, + PathSeparator, + Boundary, + NonWord, + End, +} + +#[tracing::instrument(skip(chars))] +fn extract_paths(chars: impl Iterator) -> Vec> { + let mut chars = chars.peekable(); + + let mut state = PathState::Begin; + let mut list = Vec::new(); + let mut path = Vec::new(); + let mut word = String::new(); + + let is_boundary = |c: char| c == '\n' || c == ' ' || c == ',' || c == '\t' || c == '|'; + + 'machine: loop { + state = match state { + PathState::Begin => match chars.next() { + None => PathState::End, + Some(c) if c.is_ascii_alphabetic() => { + word.push(c); + PathState::PathComponent + } + Some(c) if is_boundary(c) => PathState::Boundary, + Some('/') => PathState::PathSeparator, + Some(_) => PathState::NonWord, + }, + PathState::PathComponent => match chars.next() { + None => { + path.push(word.clone()); + list.push(path.clone()); + + PathState::End + } + Some(c) if c.is_ascii_alphanumeric() || c == '_' => { + word.push(c); + PathState::PathComponent + } + Some('/') => { + path.push(word.clone()); + word.clear(); + + PathState::PathSeparator + } + Some(c) if is_boundary(c) => { + path.push(word.clone()); + list.push(path.clone()); + + path.clear(); + word.clear(); + + PathState::Boundary + } + Some(_) => { + list.push(path.clone()); + + path.clear(); + word.clear(); + + PathState::NonWord + } + }, + PathState::PathSeparator => match chars.next() { + None => { + list.push(path.clone()); + PathState::End + } + Some('/') => PathState::PathSeparator, + Some(c) if c.is_ascii_alphabetic() || c == '_' => { + word.push(c); + PathState::PathComponent + } + Some(c) if is_boundary(c) => { + list.push(path.clone()); + path.clear(); + PathState::Boundary + } + Some(_) => { + list.push(path.clone()); + path.clear(); + PathState::NonWord + } + }, + PathState::Boundary => match chars.next() { + None => PathState::End, + Some(c) if c.is_ascii_alphabetic() => { + word.push(c); + PathState::PathComponent + } + Some(c) if is_boundary(c) => PathState::Boundary, + Some(_) => PathState::NonWord, + }, + PathState::NonWord => match chars.next() { + None => PathState::End, + Some(c) if is_boundary(c) => PathState::Boundary, + Some(_) => PathState::NonWord, + }, + PathState::End => { + break 'machine; + } + } + } + + list +} + +#[tracing::instrument(skip(chars))] +fn algorithm_path_components(chars: impl Iterator, min_length: usize) { + let mut chars = chars.peekable(); + + let mut state = PathState::Begin; + let mut word = String::new(); + let mut lists = vec![HashMap::::new()]; + let mut index = 0; + + let is_boundary = |c: char| c == '\n' || c == ' ' || c == ',' || c == '\t'; + + 'machine: loop { + state = match state { + PathState::Begin => match chars.next() { + None => PathState::End, + Some(c) if c.is_ascii_alphabetic() => { + word.push(c); + PathState::PathComponent + } + Some(c) if is_boundary(c) => PathState::Boundary, + // Ignore leading path separators to not trigger the logic of advancing + // the component count + Some('/') => PathState::Boundary, + Some(_) => PathState::NonWord, + }, + PathState::PathComponent => match chars.next() { + None => PathState::End, + Some(c) if c.is_ascii_alphanumeric() || c == '_' => { + word.push(c); + PathState::PathComponent + } + Some('/') => PathState::PathSeparator, + Some(c) => { + if index > 0 && word.len() >= min_length { + let list = &mut lists[index]; + list.entry(word.clone()) + .and_modify(|count| *count += 1) + .or_insert(1); + } + word.clear(); + + index = 0; + + if is_boundary(c) { + PathState::Boundary + } else { + PathState::NonWord + } + } + }, + PathState::PathSeparator => { + if word.len() >= min_length { + let list = &mut lists[index]; + list.entry(word.clone()) + .and_modify(|count| *count += 1) + .or_insert(1); + } + word.clear(); + + index += 1; + if lists.get(index).is_none() { + lists.push(HashMap::new()); + } + + // Ignore multiple separators + while chars.next_if(|c| *c == '/').is_some() {} + + match chars.next() { + None => PathState::End, + Some(c) if c.is_ascii_alphabetic() || c == '_' => { + word.push(c); + PathState::PathComponent + } + Some(c) if is_boundary(c) => { + index = 0; + PathState::Boundary + } + Some(_) => { + index = 0; + PathState::NonWord + } + } + } + PathState::Boundary => match chars.next() { + None => PathState::End, + Some(c) if c.is_ascii_alphabetic() => { + word.push(c); + PathState::PathComponent + } + Some(c) if is_boundary(c) => PathState::Boundary, + Some(_) => PathState::NonWord, + }, + PathState::NonWord => match chars.next() { + None => PathState::End, + Some(c) if is_boundary(c) => PathState::Boundary, + Some(_) => PathState::NonWord, + }, + PathState::End => { + if word.len() >= min_length { + let list = &mut lists[index]; + list.entry(word.clone()) + .and_modify(|count| *count += 1) + .or_insert(1); + } + + break 'machine; + } + } + } + + for i in 0..lists.len() { + print!("Word {i}, Count {i},"); + } + println!(); + + let mut lines: Vec>> = Vec::new(); + + for (i, list) in lists.into_iter().enumerate() { + let mut entries = list.into_iter().collect::>(); + entries.sort_by(|(_, a), (_, b)| b.partial_cmp(a).unwrap()); + + for (j, (word, count)) in entries.into_iter().enumerate() { + if let Some(line) = lines.get_mut(j) { + while line.len() < i { + line.push(None); + } + line.push(Some((word, count))); + } else { + let mut line = Vec::new(); + while line.len() < i { + line.push(None); + } + line.push(Some((word, count))); + lines.push(line); + } + } + } + + for line in lines.iter() { + for cell in line.iter() { + if let Some((word, count)) = cell { + print!("{},{},", word, count); + } else { + print!(",,"); + } + } + println!(); + } +} + +#[derive(Copy, Clone, Debug)] +enum State { + Begin, + NonWord, + Word, + End, +} + +#[tracing::instrument(skip_all)] +pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> { + let path = matches + .get_one::("file") + .expect("missing required parameter"); + + let algorithm = matches + .get_one::("algorithm") + .expect("parameter has default"); + + let min_length = matches + .get_one::("min-length") + .copied() + .expect("paramter has default"); + + let content = fs::read_to_string(&path) + .await + .wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?; + let mut chars = content.chars(); + + if *algorithm == Algorithm::Paths { + algorithm_path_components(chars, min_length); + return Ok(()); + } + + let mut state = State::Begin; + let mut word = String::new(); + let mut visited = HashMap::new(); + + 'machine: loop { + state = match state { + State::Begin => match chars.next() { + None => State::End, + Some(c) if algorithm.is_start(c) => { + word.push(c); + State::Word + } + Some(_) => State::NonWord, + }, + State::End => break 'machine, + State::NonWord => match chars.next() { + None => State::End, + Some(c) if algorithm.is_body(c) => { + word.push(c); + State::Word + } + Some(_) => State::NonWord, + }, + State::Word => match chars.next() { + None => { + if word.len() >= min_length && algorithm.is_length(word.len()) { + visited + .entry(word.clone()) + .and_modify(|v| *v += 1) + .or_insert(1); + } + State::End + } + Some(c) if algorithm.is_body(c) => { + word.push(c); + State::Word + } + Some(_) => { + if word.len() >= min_length && algorithm.is_length(word.len()) { + visited + .entry(word.clone()) + .and_modify(|v| *v += 1) + .or_insert(1); + } + word.clear(); + State::NonWord + } + }, + } + } + + let mut entries: Vec<(String, usize)> = visited.into_iter().collect(); + // Reverse sides during comparison to get "highest to lowest" + entries.sort_by(|(_, a), (_, b)| b.partial_cmp(a).unwrap()); + + entries + .iter() + .for_each(|(word, count)| println!("{:016} {}", word, count)); + + Ok(()) +} diff --git a/crates/dtmt/src/cmd/experiment/mod.rs b/crates/dtmt/src/cmd/experiment/mod.rs new file mode 100644 index 0000000..c53d9b5 --- /dev/null +++ b/crates/dtmt/src/cmd/experiment/mod.rs @@ -0,0 +1,26 @@ +use clap::{ArgMatches, Command}; +use color_eyre::Result; + +mod brute_force_words; +mod extract_words; + +pub(crate) fn command_definition() -> Command { + Command::new("experiment") + .subcommand_required(true) + .about("A collection of utilities and experiments.") + .subcommand(brute_force_words::command_definition()) + .subcommand(extract_words::command_definition()) +} + +#[tracing::instrument(skip_all)] +pub(crate) async fn run(ctx: sdk::Context, matches: &ArgMatches) -> Result<()> { + match matches.subcommand() { + // It's fine to block here, as this is the only thing that's executing on the runtime. + // The other option with `spawn_blocking` would require setting up values to be Send+Sync. + Some(("brute-force-words", sub_matches)) => brute_force_words::run(ctx, sub_matches), + Some(("extract-words", sub_matches)) => extract_words::run(ctx, sub_matches).await, + _ => unreachable!( + "clap is configured to require a subcommand, and they're all handled above" + ), + } +} diff --git a/crates/dtmt/src/main.rs b/crates/dtmt/src/main.rs index 2e10b17..b01956a 100644 --- a/crates/dtmt/src/main.rs +++ b/crates/dtmt/src/main.rs @@ -21,6 +21,7 @@ mod cmd { pub mod build; pub mod bundle; pub mod dictionary; + pub mod experiment; pub mod migrate; pub mod murmur; pub mod new; @@ -56,6 +57,7 @@ async fn main() -> Result<()> { .subcommand(cmd::build::command_definition()) .subcommand(cmd::bundle::command_definition()) .subcommand(cmd::dictionary::command_definition()) + .subcommand(cmd::experiment::command_definition()) .subcommand(cmd::migrate::command_definition()) .subcommand(cmd::murmur::command_definition()) .subcommand(cmd::new::command_definition()) @@ -133,6 +135,7 @@ async fn main() -> Result<()> { Some(("build", sub_matches)) => cmd::build::run(ctx, sub_matches).await?, Some(("bundle", sub_matches)) => cmd::bundle::run(ctx, sub_matches).await?, Some(("dictionary", sub_matches)) => cmd::dictionary::run(ctx, sub_matches).await?, + Some(("experiment", sub_matches)) => cmd::experiment::run(ctx, sub_matches).await?, Some(("migrate", sub_matches)) => cmd::migrate::run(ctx, sub_matches).await?, Some(("murmur", sub_matches)) => cmd::murmur::run(ctx, sub_matches).await?, Some(("new", sub_matches)) => cmd::new::run(ctx, sub_matches).await?,