diff --git a/Cargo.lock b/Cargo.lock index 3a02b55..a251de9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -161,17 +161,6 @@ dependencies = [ "system-deps", ] -[[package]] -name = "atty" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" -dependencies = [ - "hermit-abi 0.1.19", - "libc", - "winapi", -] - [[package]] name = "autocfg" version = "1.3.0" @@ -223,7 +212,7 @@ dependencies = [ "bitflags 2.5.0", "cexpr", "clang-sys", - "itertools 0.12.1", + "itertools", "lazy_static", "lazycell", "log", @@ -658,20 +647,6 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "crossbeam" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2801af0d36612ae591caa9568261fddce32ce6e08a7275ea334a06a4ad021a2c" -dependencies = [ - "cfg-if", - "crossbeam-channel", - "crossbeam-deque", - "crossbeam-epoch", - "crossbeam-queue", - "crossbeam-utils", -] - [[package]] name = "crossbeam-channel" version = "0.5.12" @@ -681,40 +656,6 @@ dependencies = [ "crossbeam-utils", ] -[[package]] -name = "crossbeam-deque" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" -dependencies = [ - "cfg-if", - "crossbeam-epoch", - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-epoch" -version = "0.9.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" -dependencies = [ - "autocfg", - "cfg-if", - "crossbeam-utils", - "memoffset 0.9.1", - "scopeguard", -] - -[[package]] -name = "crossbeam-queue" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1cfb3ea8a53f37c40dea2c7bedcbd88bdfae54f5e2175d6ecaff1c988353add" -dependencies = [ - "cfg-if", - "crossbeam-utils", -] - [[package]] name = "crossbeam-utils" version = "0.8.20" @@ -986,18 +927,15 @@ name = "dtmt" version = "0.3.0" dependencies = [ "async-recursion", - "atty", "clap", "cli-table", "color-eyre", "confy", - "crossbeam", "csv-async", "dtmt-shared", "futures", "futures-util", "glob", - "itertools 0.11.0", "luajit2-sys", "nanorand", "notify", @@ -1660,15 +1598,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" -[[package]] -name = "hermit-abi" -version = "0.1.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" -dependencies = [ - "libc", -] - [[package]] name = "hermit-abi" version = "0.3.9" @@ -1929,15 +1858,6 @@ version = "1.70.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" -[[package]] -name = "itertools" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" -dependencies = [ - "either", -] - [[package]] name = "itertools" version = "0.12.1" @@ -2347,7 +2267,7 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" dependencies = [ - "hermit-abi 0.3.9", + "hermit-abi", "libc", ] diff --git a/crates/dtmt/Cargo.toml b/crates/dtmt/Cargo.toml index e80feaa..d836a50 100644 --- a/crates/dtmt/Cargo.toml +++ b/crates/dtmt/Cargo.toml @@ -33,9 +33,6 @@ async-recursion = "1.0.2" notify = "6.1.1" luajit2-sys = { path = "../../lib/luajit2-sys", version = "*" } shlex = { version = "1.2.0", optional = true } -atty = "0.2.14" -itertools = "0.11.0" -crossbeam = { version = "0.8.2", features = ["crossbeam-deque"] } [dev-dependencies] tempfile = "3.3.0" diff --git a/crates/dtmt/src/cmd/experiment/brute_force_words.rs b/crates/dtmt/src/cmd/experiment/brute_force_words.rs deleted file mode 100644 index 26d887f..0000000 --- a/crates/dtmt/src/cmd/experiment/brute_force_words.rs +++ /dev/null @@ -1,520 +0,0 @@ -use std::collections::HashSet; -use std::fs; -use std::io::{BufWriter, Write}; -use std::path::PathBuf; -use std::sync::Arc; -use std::thread::JoinHandle; - -use clap::{value_parser, Arg, ArgAction, ArgMatches, Command}; -use color_eyre::eyre::{self, Context}; -use color_eyre::Result; -use crossbeam::channel::{bounded, unbounded, Receiver, Sender}; -use itertools::Itertools; -use sdk::murmur::Murmur64; -use tokio::time::Instant; - -pub(crate) fn command_definition() -> Command { - Command::new("brute-force-words") - .about( - "Given a list of words and a set of delimiters, iteratevily creates permutations \ - of growing length.\n\ - Delimiters are placed between every word in the result.\n\n\ - Example: \ - Given the words ['packages', 'boot'], the delimiters ['/', '_'] and a length of 2, the resulting \ - words will be\n\ - - packages\n\ - - boot\n\ - - packages/packages\n\ - - packages_packages\n\ - - packages/boot\n\ - - packages_boot\n\ - - boot/packages\n\ - - boot_packages\n\ - - boot/boot\n\ - - boot_boot", - ) - .arg( - Arg::new("delimiter") - .help( - "The delimiters to put between the words. \ - All permutations of this list will be tried for every string of words.\n\ - Specify multiple times to set multiple values.\n\ - Defaults to ['/', '_'].", - ) - .short('d') - .long("delimiter") - .action(ArgAction::Append), - ) - .arg( - Arg::new("max-length") - .help("The maximum number of words up to which to build strings.") - .long("max") - .long("max-length") - .short('m') - .default_value("5") - .value_parser(value_parser!(usize)), - ) - .arg( - Arg::new("continue") - .help("Can be used to continue a previous operation where it stopped. Word list and delimiters must match.") - .short('c') - .long("continue") - ) - .arg( - Arg::new("threads") - .help("The number of workers to run in parallel.") - .long("threads") - .short('n') - .default_value("6") - .value_parser(value_parser!(usize)) - ) - .arg( - Arg::new("words") - .help("Path to a file containing words line by line.") - .required(true) - .value_parser(value_parser!(PathBuf)), - ) - .arg( - Arg::new("hashes") - .help( - "Path to a file containing the hashes to attempt to brute force. \ - Hashes are expected in hexadecimal notiation. \ - Only 64-bit hashes are supported." - ) - .required(true) - .value_parser(value_parser!(PathBuf)), - ) -} - -const LINE_FEED: u8 = 0x0A; -const UNDERSCORE: u8 = 0x5F; -const ZERO: u8 = 0x30; - -const PREFIXES: [&str; 36] = [ - "", - "content/characters/", - "content/debug/", - "content/decals/", - "content/environment/", - "content/fx/", - "content/fx/particles/", - "content/gizmos/", - "content/items/", - "content/levels/", - "content/liquid_area/", - "content/localization/", - "content/materials/", - "content/minion_impact_assets/", - "content/pickups/", - "content/shading_environments/", - "content/textures/", - "content/ui/", - "content/videos/", - "content/vo/", - "content/volume_types/", - "content/weapons/", - "content/", - "core/", - "core/units/", - "packages/boot_assets/", - "packages/content/", - "packages/game_scripts/", - "packages/strings/", - "packages/ui/", - "packages/", - "wwise/events/", - "wwise/packages/", - "wwise/world_sound_fx/", - "wwise/events/weapons/", - "wwise/events/minions/", -]; - -fn make_info_printer(rx: Receiver<(usize, usize, String)>, hash_count: usize) -> JoinHandle<()> { - std::thread::spawn(move || { - let mut writer = std::io::stderr(); - let mut total_count = 0; - let mut total_found = 0; - - let mut start = Instant::now(); - - while let Ok((count, found, last)) = rx.recv() { - total_count += count; - total_found += found; - - let now = Instant::now(); - if (now - start).as_millis() > 250 { - let s = &last[0..std::cmp::min(last.len(), 60)]; - let s = format!( - "\r{:12} per second | {total_found:6}/{hash_count} found | {s:<60}", - total_count * 4 - ); - - writer.write_all(s.as_bytes()).unwrap(); - - total_count = 0; - start = now; - } - } - }) -} - -fn make_stdout_printer(rx: Receiver>) -> JoinHandle<()> { - std::thread::spawn(move || { - let mut writer = std::io::stdout(); - - while let Ok(buf) = rx.recv() { - writer.write_all(&buf).unwrap(); - } - }) -} - -struct State { - delimiter_lists: Arc>>, - hashes: Arc>, - words: Arc>, - delimiters_len: usize, - stdout_tx: Sender>, - info_tx: Sender<(usize, usize, String)>, -} - -fn make_worker(rx: Receiver>, state: State) -> JoinHandle<()> { - std::thread::spawn(move || { - let delimiter_lists = &state.delimiter_lists; - let hashes = &state.hashes; - let words = &state.words; - let delimiters_len = state.delimiters_len; - - let mut count = 0; - let mut found = 0; - let mut buf = Vec::with_capacity(1024); - - while let Ok(indices) = rx.recv() { - let sequence = indices.iter().map(|i| words[*i].as_str()); - - // We only want delimiters between words, so we keep that iterator shorter by - // one. - let delimiter_count = sequence.len() as u32 - 1; - - for prefix in PREFIXES.iter().map(|p| p.as_bytes()) { - buf.clear(); - - // We can keep the prefix at the front of the buffer and only - // replace the parts after that. - let prefix_len = prefix.len(); - buf.extend_from_slice(prefix); - - for delims in delimiter_lists - .iter() - .take(delimiters_len.pow(delimiter_count)) - { - buf.truncate(prefix_len); - - let delims = delims - .iter() - .map(|s| s.as_str()) - .take(delimiter_count as usize); - sequence - .clone() - .interleave(delims.clone()) - .for_each(|word| buf.extend_from_slice(word.as_bytes())); - - count += 1; - - let hash = Murmur64::hash(&buf); - if hashes.contains(&hash) { - found += 1; - - buf.push(LINE_FEED); - if state.stdout_tx.send(buf.clone()).is_err() { - return; - } - } else { - let word_len = buf.len(); - - // If the regular word itself didn't match, we check - // for numbered suffixes. - // For now, we only check up to `09` to avoid more complex logic - // writing into the buffer. - // Packages that contain files with higher numbers than this - // should hopefully become easier to spot once a good number of - // hashes is found. - for i in 1..=9 { - buf.truncate(word_len); - buf.push(UNDERSCORE); - buf.push(ZERO); - buf.push(ZERO + i); - - count += 1; - - let hash = Murmur64::hash(&buf); - if hashes.contains(&hash) { - found += 1; - - buf.push(LINE_FEED); - if state.stdout_tx.send(buf.clone()).is_err() { - return; - } - } else { - break; - } - } - } - } - } - - if count >= 2 * 1024 * 1024 { - // The last prefix in the set is the one that will stay in the buffer - // when we're about to print here. - // So we strip that, to show just the generated part. - // We also restrict the length to stay on a single line. - let prefix_len = PREFIXES[35].len(); - // No need to wait for this - let _ = state.info_tx.try_send(( - count, - found, - String::from_utf8_lossy(&buf[prefix_len..]).to_string(), - )); - - count = 0; - found = 0; - } - } - }) -} - -fn build_delimiter_lists(delimiters: impl AsRef<[String]>, max_length: usize) -> Vec> { - let delimiters = delimiters.as_ref(); - let mut indices = vec![0; max_length]; - let mut list = Vec::new(); - - for _ in 0..delimiters.len().pow(max_length as u32) { - list.push( - indices - .iter() - .map(|i| delimiters[*i].clone()) - .collect::>(), - ); - - for v in indices.iter_mut() { - if *v >= delimiters.len() - 1 { - *v = 0; - break; - } else { - *v += 1; - } - } - } - - list -} - -fn build_initial_indices( - cont: Option<&String>, - delimiters: impl AsRef<[String]>, - words: impl AsRef<[String]>, -) -> Result> { - if let Some(cont) = cont { - let mut splits = vec![cont.clone()]; - - for delim in delimiters.as_ref().iter() { - splits = splits - .iter() - .flat_map(|s| s.split(delim)) - .map(|s| s.to_string()) - .collect(); - } - - let indices = splits - .into_iter() - .map(|s| { - words - .as_ref() - .iter() - .enumerate() - .find(|(_, v)| s == **v) - .map(|(i, _)| i) - .ok_or_else(|| eyre::eyre!("'{}' is not in the word list", s)) - }) - .collect::>()?; - - tracing::info!("Continuing from '{}' -> '{:?}'", cont, &indices); - - Ok(indices) - } else { - Ok(vec![0]) - } -} - -#[tracing::instrument(skip_all)] -#[allow(clippy::mut_range_bound)] -pub(crate) fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> { - let max_length: usize = matches - .get_one::("max-length") - .copied() - .expect("parameter has default"); - - let num_threads: usize = matches - .get_one::("threads") - .copied() - .expect("parameter has default"); - - let words = { - let path = matches - .get_one::("words") - .expect("missing required parameter"); - - let file = fs::read_to_string(path) - .wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?; - - let words: Vec<_> = file.lines().map(str::to_string).collect(); - - if words.is_empty() { - eyre::bail!("Word list must not be empty"); - } - - Arc::new(words) - }; - - let hashes = { - let path = matches - .get_one::("hashes") - .expect("missing required argument"); - let content = fs::read_to_string(path) - .wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?; - - let hashes: Result, _> = content - .lines() - .map(|s| u64::from_str_radix(s, 16).map(Murmur64::from)) - .collect(); - - let hashes = hashes?; - - tracing::trace!("{:?}", hashes); - - Arc::new(hashes) - }; - - let mut delimiters: Vec = matches - .get_many::("delimiter") - .unwrap_or_default() - .cloned() - .collect(); - - if delimiters.is_empty() { - delimiters.push(String::from("/")); - delimiters.push(String::from("_")); - } - - let delimiters_len = delimiters.len(); - - let word_count = words.len(); - tracing::info!("{} words to try", word_count); - - // To be able to easily combine the permutations of words and delimiters, - // we turn the latter into a pre-defined list of all permutations of delimiters - // that are possible at the given amount of words. - // Combining `Iterator::cycle` with `Itertools::permutations` works, but - // with a high `max_length`, it runs OOM. - // So we basically have to implement a smaller version of the iterative algorithm we use later on - // to build permutations of the actual words. - let delimiter_lists = { - let lists = build_delimiter_lists(&delimiters, max_length - 1); - Arc::new(lists) - }; - tracing::debug!("{:?}", delimiter_lists); - - let (info_tx, info_rx) = bounded(100); - let (stdout_tx, stdout_rx) = unbounded::>(); - let (task_tx, task_rx) = bounded::>(num_threads * 4); - let mut handles = Vec::new(); - - for _ in 0..num_threads { - let handle = make_worker( - task_rx.clone(), - State { - delimiter_lists: Arc::clone(&delimiter_lists), - hashes: Arc::clone(&hashes), - words: Arc::clone(&words), - delimiters_len, - stdout_tx: stdout_tx.clone(), - info_tx: info_tx.clone(), - }, - ); - handles.push(handle); - } - // These are only used inside the worker threads, but due to the loops above, we had to - // clone them one too many times. - // So we drop that extra reference immediately, to ensure that the channels can - // disconnect properly when the threads finish. - drop(stdout_tx); - drop(info_tx); - - handles.push(make_info_printer(info_rx, hashes.len())); - handles.push(make_stdout_printer(stdout_rx)); - - let mut indices = - build_initial_indices(matches.get_one::("continue"), &delimiters, &*words) - .wrap_err("Failed to build initial indices")?; - let mut indices_len = indices.len(); - let mut sequence = indices - .iter() - .map(|index| words[*index].as_str()) - .collect::>(); - - // Prevent re-allocation by reserving as much as we need upfront - indices.reserve(max_length); - sequence.reserve(max_length); - - 'outer: loop { - task_tx.send(indices.clone())?; - - for i in 0..indices_len { - let index = indices.get_mut(i).unwrap(); - let word = sequence.get_mut(i).unwrap(); - - if *index >= word_count - 1 { - *index = 0; - *word = words[*index].as_str(); - - if indices.get(i + 1).is_none() { - indices_len += 1; - - if indices_len > max_length { - break 'outer; - } - - indices.push(0); - sequence.push(words[0].as_str()); - - break; - } - } else { - *index += 1; - *word = words[*index].as_str(); - break; - } - } - } - - // Dropping the senders will disconnect the channel, - // so that the threads holding the other end will eventually - // complete as well. - drop(task_tx); - - for handle in handles { - match handle.join() { - Ok(_) => {} - Err(value) => { - if let Some(err) = value.downcast_ref::() { - eyre::bail!("Thread failed: {}", err); - } else { - eyre::bail!("Thread failed with unknown error: {:?}", value); - } - } - } - } - - let _ = std::io::stdout().write_all("\r".as_bytes()); - - Ok(()) -} diff --git a/crates/dtmt/src/cmd/experiment/extract_words.rs b/crates/dtmt/src/cmd/experiment/extract_words.rs deleted file mode 100644 index 1a8cda5..0000000 --- a/crates/dtmt/src/cmd/experiment/extract_words.rs +++ /dev/null @@ -1,463 +0,0 @@ -use std::collections::HashMap; -use std::path::PathBuf; - -use clap::{value_parser, Arg, ArgMatches, Command, ValueEnum}; -use color_eyre::eyre::Context; -use color_eyre::Result; -use tokio::fs; - -pub(crate) fn command_definition() -> Command { - Command::new("extract-words") - .about( - "Extract unique alphanumeric sequences that match common identifier rules from the given file. \ - Only ASCII is supported.", - ) - .arg( - Arg::new("file") - .required(true) - .value_parser(value_parser!(PathBuf)) - .help("Path to the file to extract words from."), - ) - .arg( - Arg::new("min-length") - .help("Minimum length to consider a word.") - .long("min-length") - .short('m') - .default_value("3") - .value_parser(value_parser!(usize)) - ) - .arg( - Arg::new("algorithm") - .help("The algorithm to determine matching words") - .long("algorithm") - .short('a') - .default_value("identifier") - .value_parser(value_parser!(Algorithm)) - ) -} - -#[derive(Copy, Clone, Debug, Eq, PartialEq, ValueEnum)] -#[value(rename_all = "snake_case")] -enum Algorithm { - Alphabetic, - Alphanumeric, - Identifier, - Number, - Hash32, - Hash64, - Paths, -} - -impl Algorithm { - fn is_start(&self, c: char) -> bool { - match self { - Self::Alphabetic => c.is_ascii_alphabetic(), - Self::Alphanumeric => c.is_ascii_alphanumeric(), - Self::Identifier => c.is_ascii_alphabetic(), - Self::Number => c.is_numeric(), - Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'), - // Supposed to be handled separately - Self::Paths => false, - } - } - - fn is_body(&self, c: char) -> bool { - match self { - Self::Alphabetic => c.is_ascii_alphabetic(), - Self::Alphanumeric => c.is_ascii_alphanumeric(), - Self::Identifier => c.is_ascii_alphanumeric(), - Self::Number => c.is_numeric(), - Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'), - // Supposed to be handled separately - Self::Paths => false, - } - } - - fn is_length(&self, len: usize) -> bool { - match self { - Self::Alphabetic => true, - Self::Alphanumeric => true, - Self::Identifier => true, - Self::Number => true, - Self::Hash32 => len == 8, - Self::Hash64 => len == 16, - // Supposed to be handled separately - Self::Paths => false, - } - } -} - -impl std::fmt::Display for Algorithm { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "{}", - match self { - Algorithm::Alphabetic => "alphabetic", - Algorithm::Alphanumeric => "alphanumeric", - Algorithm::Identifier => "identifier", - Algorithm::Number => "number", - Algorithm::Hash32 => "hash32", - Algorithm::Hash64 => "hash64", - Algorithm::Paths => "paths", - } - ) - } -} - -#[derive(Copy, Clone, Debug)] -enum PathState { - Begin, - PathComponent, - PathSeparator, - Boundary, - NonWord, - End, -} - -#[tracing::instrument(skip(chars))] -fn extract_paths(chars: impl Iterator) -> Vec> { - let mut chars = chars.peekable(); - - let mut state = PathState::Begin; - let mut list = Vec::new(); - let mut path = Vec::new(); - let mut word = String::new(); - - let is_boundary = |c: char| c == '\n' || c == ' ' || c == ',' || c == '\t' || c == '|'; - - 'machine: loop { - state = match state { - PathState::Begin => match chars.next() { - None => PathState::End, - Some(c) if c.is_ascii_alphabetic() => { - word.push(c); - PathState::PathComponent - } - Some(c) if is_boundary(c) => PathState::Boundary, - Some('/') => PathState::PathSeparator, - Some(_) => PathState::NonWord, - }, - PathState::PathComponent => match chars.next() { - None => { - path.push(word.clone()); - list.push(path.clone()); - - PathState::End - } - Some(c) if c.is_ascii_alphanumeric() || c == '_' => { - word.push(c); - PathState::PathComponent - } - Some('/') => { - path.push(word.clone()); - word.clear(); - - PathState::PathSeparator - } - Some(c) if is_boundary(c) => { - path.push(word.clone()); - list.push(path.clone()); - - path.clear(); - word.clear(); - - PathState::Boundary - } - Some(_) => { - list.push(path.clone()); - - path.clear(); - word.clear(); - - PathState::NonWord - } - }, - PathState::PathSeparator => match chars.next() { - None => { - list.push(path.clone()); - PathState::End - } - Some('/') => PathState::PathSeparator, - Some(c) if c.is_ascii_alphabetic() || c == '_' => { - word.push(c); - PathState::PathComponent - } - Some(c) if is_boundary(c) => { - list.push(path.clone()); - path.clear(); - PathState::Boundary - } - Some(_) => { - list.push(path.clone()); - path.clear(); - PathState::NonWord - } - }, - PathState::Boundary => match chars.next() { - None => PathState::End, - Some(c) if c.is_ascii_alphabetic() => { - word.push(c); - PathState::PathComponent - } - Some(c) if is_boundary(c) => PathState::Boundary, - Some(_) => PathState::NonWord, - }, - PathState::NonWord => match chars.next() { - None => PathState::End, - Some(c) if is_boundary(c) => PathState::Boundary, - Some(_) => PathState::NonWord, - }, - PathState::End => { - break 'machine; - } - } - } - - list -} - -#[tracing::instrument(skip(chars))] -fn algorithm_path_components(chars: impl Iterator, min_length: usize) { - let mut chars = chars.peekable(); - - let mut state = PathState::Begin; - let mut word = String::new(); - let mut lists = vec![HashMap::::new()]; - let mut index = 0; - - let is_boundary = |c: char| c == '\n' || c == ' ' || c == ',' || c == '\t'; - - 'machine: loop { - state = match state { - PathState::Begin => match chars.next() { - None => PathState::End, - Some(c) if c.is_ascii_alphabetic() => { - word.push(c); - PathState::PathComponent - } - Some(c) if is_boundary(c) => PathState::Boundary, - // Ignore leading path separators to not trigger the logic of advancing - // the component count - Some('/') => PathState::Boundary, - Some(_) => PathState::NonWord, - }, - PathState::PathComponent => match chars.next() { - None => PathState::End, - Some(c) if c.is_ascii_alphanumeric() || c == '_' => { - word.push(c); - PathState::PathComponent - } - Some('/') => PathState::PathSeparator, - Some(c) => { - if index > 0 && word.len() >= min_length { - let list = &mut lists[index]; - list.entry(word.clone()) - .and_modify(|count| *count += 1) - .or_insert(1); - } - word.clear(); - - index = 0; - - if is_boundary(c) { - PathState::Boundary - } else { - PathState::NonWord - } - } - }, - PathState::PathSeparator => { - if word.len() >= min_length { - let list = &mut lists[index]; - list.entry(word.clone()) - .and_modify(|count| *count += 1) - .or_insert(1); - } - word.clear(); - - index += 1; - if lists.get(index).is_none() { - lists.push(HashMap::new()); - } - - // Ignore multiple separators - while chars.next_if(|c| *c == '/').is_some() {} - - match chars.next() { - None => PathState::End, - Some(c) if c.is_ascii_alphabetic() || c == '_' => { - word.push(c); - PathState::PathComponent - } - Some(c) if is_boundary(c) => { - index = 0; - PathState::Boundary - } - Some(_) => { - index = 0; - PathState::NonWord - } - } - } - PathState::Boundary => match chars.next() { - None => PathState::End, - Some(c) if c.is_ascii_alphabetic() => { - word.push(c); - PathState::PathComponent - } - Some(c) if is_boundary(c) => PathState::Boundary, - Some(_) => PathState::NonWord, - }, - PathState::NonWord => match chars.next() { - None => PathState::End, - Some(c) if is_boundary(c) => PathState::Boundary, - Some(_) => PathState::NonWord, - }, - PathState::End => { - if word.len() >= min_length { - let list = &mut lists[index]; - list.entry(word.clone()) - .and_modify(|count| *count += 1) - .or_insert(1); - } - - break 'machine; - } - } - } - - for i in 0..lists.len() { - print!("Word {i}, Count {i},"); - } - println!(); - - let mut lines: Vec>> = Vec::new(); - - for (i, list) in lists.into_iter().enumerate() { - let mut entries = list.into_iter().collect::>(); - entries.sort_by(|(_, a), (_, b)| b.partial_cmp(a).unwrap()); - - for (j, (word, count)) in entries.into_iter().enumerate() { - if let Some(line) = lines.get_mut(j) { - while line.len() < i { - line.push(None); - } - line.push(Some((word, count))); - } else { - let mut line = Vec::new(); - while line.len() < i { - line.push(None); - } - line.push(Some((word, count))); - lines.push(line); - } - } - } - - for line in lines.iter() { - for cell in line.iter() { - if let Some((word, count)) = cell { - print!("{},{},", word, count); - } else { - print!(",,"); - } - } - println!(); - } -} - -#[derive(Copy, Clone, Debug)] -enum State { - Begin, - NonWord, - Word, - End, -} - -#[tracing::instrument(skip_all)] -pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> { - let path = matches - .get_one::("file") - .expect("missing required parameter"); - - let algorithm = matches - .get_one::("algorithm") - .expect("parameter has default"); - - let min_length = matches - .get_one::("min-length") - .copied() - .expect("paramter has default"); - - let content = fs::read_to_string(&path) - .await - .wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?; - let mut chars = content.chars(); - - if *algorithm == Algorithm::Paths { - algorithm_path_components(chars, min_length); - return Ok(()); - } - - let mut state = State::Begin; - let mut word = String::new(); - let mut visited = HashMap::new(); - - 'machine: loop { - state = match state { - State::Begin => match chars.next() { - None => State::End, - Some(c) if algorithm.is_start(c) => { - word.push(c); - State::Word - } - Some(_) => State::NonWord, - }, - State::End => break 'machine, - State::NonWord => match chars.next() { - None => State::End, - Some(c) if algorithm.is_body(c) => { - word.push(c); - State::Word - } - Some(_) => State::NonWord, - }, - State::Word => match chars.next() { - None => { - if word.len() >= min_length && algorithm.is_length(word.len()) { - visited - .entry(word.clone()) - .and_modify(|v| *v += 1) - .or_insert(1); - } - State::End - } - Some(c) if algorithm.is_body(c) => { - word.push(c); - State::Word - } - Some(_) => { - if word.len() >= min_length && algorithm.is_length(word.len()) { - visited - .entry(word.clone()) - .and_modify(|v| *v += 1) - .or_insert(1); - } - word.clear(); - State::NonWord - } - }, - } - } - - let mut entries: Vec<(String, usize)> = visited.into_iter().collect(); - // Reverse sides during comparison to get "highest to lowest" - entries.sort_by(|(_, a), (_, b)| b.partial_cmp(a).unwrap()); - - entries - .iter() - .for_each(|(word, count)| println!("{:016} {}", word, count)); - - Ok(()) -} diff --git a/crates/dtmt/src/cmd/experiment/mod.rs b/crates/dtmt/src/cmd/experiment/mod.rs deleted file mode 100644 index c53d9b5..0000000 --- a/crates/dtmt/src/cmd/experiment/mod.rs +++ /dev/null @@ -1,26 +0,0 @@ -use clap::{ArgMatches, Command}; -use color_eyre::Result; - -mod brute_force_words; -mod extract_words; - -pub(crate) fn command_definition() -> Command { - Command::new("experiment") - .subcommand_required(true) - .about("A collection of utilities and experiments.") - .subcommand(brute_force_words::command_definition()) - .subcommand(extract_words::command_definition()) -} - -#[tracing::instrument(skip_all)] -pub(crate) async fn run(ctx: sdk::Context, matches: &ArgMatches) -> Result<()> { - match matches.subcommand() { - // It's fine to block here, as this is the only thing that's executing on the runtime. - // The other option with `spawn_blocking` would require setting up values to be Send+Sync. - Some(("brute-force-words", sub_matches)) => brute_force_words::run(ctx, sub_matches), - Some(("extract-words", sub_matches)) => extract_words::run(ctx, sub_matches).await, - _ => unreachable!( - "clap is configured to require a subcommand, and they're all handled above" - ), - } -} diff --git a/crates/dtmt/src/main.rs b/crates/dtmt/src/main.rs index b01956a..2e10b17 100644 --- a/crates/dtmt/src/main.rs +++ b/crates/dtmt/src/main.rs @@ -21,7 +21,6 @@ mod cmd { pub mod build; pub mod bundle; pub mod dictionary; - pub mod experiment; pub mod migrate; pub mod murmur; pub mod new; @@ -57,7 +56,6 @@ async fn main() -> Result<()> { .subcommand(cmd::build::command_definition()) .subcommand(cmd::bundle::command_definition()) .subcommand(cmd::dictionary::command_definition()) - .subcommand(cmd::experiment::command_definition()) .subcommand(cmd::migrate::command_definition()) .subcommand(cmd::murmur::command_definition()) .subcommand(cmd::new::command_definition()) @@ -135,7 +133,6 @@ async fn main() -> Result<()> { Some(("build", sub_matches)) => cmd::build::run(ctx, sub_matches).await?, Some(("bundle", sub_matches)) => cmd::bundle::run(ctx, sub_matches).await?, Some(("dictionary", sub_matches)) => cmd::dictionary::run(ctx, sub_matches).await?, - Some(("experiment", sub_matches)) => cmd::experiment::run(ctx, sub_matches).await?, Some(("migrate", sub_matches)) => cmd::migrate::run(ctx, sub_matches).await?, Some(("murmur", sub_matches)) => cmd::murmur::run(ctx, sub_matches).await?, Some(("new", sub_matches)) => cmd::new::run(ctx, sub_matches).await?,