Compare commits

...

10 commits

Author SHA1 Message Date
ae1e7e5aa6
dtmt: Add word extraction algorithm for paths 2024-07-17 09:29:41 +02:00
6ada4c1c43
sdk: Add additional brute force prefixes 2024-07-17 09:29:39 +02:00
6449354714
sdk: Reimplement logging current word 2024-07-17 09:29:37 +02:00
b366185a63
sdk: Implement worker pool for word generation
Massive speed improvement. The index generation is really fast,
and it appears that even worker counts well above the core/thread
count still increase throughput slightly.

The only missing part is the info output; that's currently broken.
2024-07-17 09:29:21 +02:00
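
A minimal sketch of the worker-pool shape introduced here, assuming crossbeam's MPMC channel (illustrative names, not the actual dtmt code):

```rust
use std::thread;

use crossbeam::channel::bounded;

fn main() {
    // A bounded channel applies backpressure: the generator blocks
    // once the workers fall behind, keeping memory use flat.
    let (task_tx, task_rx) = bounded::<Vec<usize>>(24);

    let handles: Vec<_> = (0..6)
        .map(|_| {
            let rx = task_rx.clone();
            thread::spawn(move || {
                // Workers drain tasks until every sender is dropped.
                while let Ok(task) = rx.recv() {
                    let _ = task; // ... build and hash candidates here ...
                }
            })
        })
        .collect();

    for i in 0..100 {
        task_tx.send(vec![i]).unwrap();
    }

    // Disconnect the channel so the workers' `recv()` loops end.
    drop(task_tx);
    for handle in handles {
        handle.join().unwrap();
    }
}
```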
951a7f82c0
sdk: Improve word generation 2024-07-17 09:28:01 +02:00
4480144d92
sdk: Implement guessing a list of hashes
While the approach of generating and storing a list of strings does allow
that list to be re-used in the future, the I/O involved turned out
to be quite costly.

While the generation can run at up to 500 MiB/s, even compressing that
on the fly doesn't bring the write volume down to what a HDD can sustain.
And compression is also necessary to store this amount of data
(generation reached two TB of raw data with a word length of just three,
which is still 600 GB compressed).
But compression also makes working with that data a lot harder.

So this instead combines both the generation and search into a single
step. The intermediate result of the generation is therefore lost,
but the overall pipeline is much faster.
2024-07-17 09:27:59 +02:00
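
The fused approach boils down to hashing each candidate as it is generated and probing a set of target hashes, as in this rough sketch (using `DefaultHasher` as a stand-in for the `Murmur64` hash the actual code uses):

```rust
use std::collections::HashSet;
use std::hash::{Hash, Hasher};

// Stand-in for Murmur64; any 64-bit hash illustrates the pipeline.
fn hash64(data: &[u8]) -> u64 {
    let mut hasher = std::collections::hash_map::DefaultHasher::new();
    data.hash(&mut hasher);
    hasher.finish()
}

fn main() {
    let targets: HashSet<u64> = [hash64(b"packages/boot")].into_iter().collect();

    let words = ["packages", "boot"];
    // Generate and check in one pass; no candidate is ever stored.
    for a in words {
        for b in words {
            let candidate = format!("{a}/{b}");
            if targets.contains(&hash64(candidate.as_bytes())) {
                println!("{candidate}");
            }
        }
    }
}
```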
0d1193a126
sdk: Improve word generation throughput
It seems that the simple `println!()` is really bad when the goal
is to write a lot of data to stdout.
Presumably because it's unbuffered, but also because it required the
preceding code to do a lot of allocations.

This was replaced with a buffered writer on stdout, as well as an extra
`Vec<u8>` that I can write everything to directly from the word and
delimiter iterators, without allocating a single new structure.
2024-07-17 09:27:57 +02:00
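
As a sketch, assuming nothing beyond the standard library: lock stdout once, wrap it in a `BufWriter`, and reuse a single `Vec<u8>` instead of calling `println!` per line.

```rust
use std::io::{BufWriter, Write};

fn main() -> std::io::Result<()> {
    // One lock and one buffer for the whole run,
    // instead of a lock and flush per `println!`.
    let mut writer = BufWriter::new(std::io::stdout().lock());

    let mut buf: Vec<u8> = Vec::with_capacity(1024);
    for word in ["packages", "boot"] {
        buf.clear(); // reuse the allocation across iterations
        buf.extend_from_slice(word.as_bytes());
        buf.push(b'\n');
        writer.write_all(&buf)?;
    }
    writer.flush()
}
```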
6485dae27b
experiment: Add command to create word permutations
This creates candidate values to brute force dictionary entries with,
by building combinations from a word list and delimiters.
2024-07-17 09:27:46 +02:00
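
For a length of two, this amounts to the cartesian product of the word list with itself, combined with every delimiter choice; a tiny sketch with `itertools` (illustrative, not the command's actual iteration code):

```rust
use itertools::Itertools;

fn main() {
    let words = ["packages", "boot"];
    let delimiters = ["/", "_"];

    // Length 1: the words themselves.
    for word in words {
        println!("{word}");
    }

    // Length 2: every ordered word pair, joined by every delimiter.
    for ((a, b), d) in words
        .iter()
        .cartesian_product(words.iter())
        .cartesian_product(delimiters.iter())
    {
        println!("{a}{d}{b}");
    }
}
```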
94347d57f9
dtmt: Add command to extract words from file
As part of trying to brute force values for the dictionary,
this allows extracting candidate words from a file.
2024-07-17 09:20:54 +02:00
2daff544a5
Add subcommand for experimental operations
These may be temporary ones that help during analyzing and developing
file formats, or long-term experiments.
2024-07-17 09:18:56 +02:00
6 changed files with 1097 additions and 2 deletions

Cargo.lock (generated), 84 lines changed

@@ -161,6 +161,17 @@ dependencies = [
"system-deps",
]
[[package]]
name = "atty"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
dependencies = [
"hermit-abi 0.1.19",
"libc",
"winapi",
]
[[package]]
name = "autocfg"
version = "1.3.0"
@@ -212,7 +223,7 @@ dependencies = [
"bitflags 2.5.0",
"cexpr",
"clang-sys",
"itertools",
"itertools 0.12.1",
"lazy_static",
"lazycell",
"log",
@@ -647,6 +658,20 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "crossbeam"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2801af0d36612ae591caa9568261fddce32ce6e08a7275ea334a06a4ad021a2c"
dependencies = [
"cfg-if",
"crossbeam-channel",
"crossbeam-deque",
"crossbeam-epoch",
"crossbeam-queue",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-channel"
version = "0.5.12"
@@ -656,6 +681,40 @@ dependencies = [
"crossbeam-utils",
]
[[package]]
name = "crossbeam-deque"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
dependencies = [
"cfg-if",
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7"
dependencies = [
"autocfg",
"cfg-if",
"crossbeam-utils",
"memoffset 0.9.1",
"scopeguard",
]
[[package]]
name = "crossbeam-queue"
version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d1cfb3ea8a53f37c40dea2c7bedcbd88bdfae54f5e2175d6ecaff1c988353add"
dependencies = [
"cfg-if",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.20"
@@ -927,15 +986,18 @@ name = "dtmt"
version = "0.3.0"
dependencies = [
"async-recursion",
"atty",
"clap",
"cli-table",
"color-eyre",
"confy",
"crossbeam",
"csv-async",
"dtmt-shared",
"futures",
"futures-util",
"glob",
"itertools 0.11.0",
"luajit2-sys",
"nanorand",
"notify",
@@ -1598,6 +1660,15 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "hermit-abi"
version = "0.1.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
dependencies = [
"libc",
]
[[package]]
name = "hermit-abi"
version = "0.3.9"
@@ -1858,6 +1929,15 @@ version = "1.70.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800"
[[package]]
name = "itertools"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57"
dependencies = [
"either",
]
[[package]]
name = "itertools"
version = "0.12.1"
@@ -2267,7 +2347,7 @@ version = "1.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
dependencies = [
"hermit-abi",
"hermit-abi 0.3.9",
"libc",
]


@@ -33,6 +33,9 @@ async-recursion = "1.0.2"
notify = "6.1.1"
luajit2-sys = { path = "../../lib/luajit2-sys", version = "*" }
shlex = { version = "1.2.0", optional = true }
atty = "0.2.14"
itertools = "0.11.0"
crossbeam = { version = "0.8.2", features = ["crossbeam-deque"] }
[dev-dependencies]
tempfile = "3.3.0"


@@ -0,0 +1,520 @@
use std::collections::HashSet;
use std::fs;
use std::io::{BufWriter, Write};
use std::path::PathBuf;
use std::sync::Arc;
use std::thread::JoinHandle;
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
use color_eyre::eyre::{self, Context};
use color_eyre::Result;
use crossbeam::channel::{bounded, unbounded, Receiver, Sender};
use itertools::Itertools;
use sdk::murmur::Murmur64;
use tokio::time::Instant;
pub(crate) fn command_definition() -> Command {
Command::new("brute-force-words")
.about(
"Given a list of words and a set of delimiters, iteratevily creates permutations \
of growing length.\n\
Delimiters are placed between every word in the result.\n\n\
Example: \
Given the words ['packages', 'boot'], the delimiters ['/', '_'] and a length of 2, the resulting \
words will be\n\
- packages\n\
- boot\n\
- packages/packages\n\
- packages_packages\n\
- packages/boot\n\
- packages_boot\n\
- boot/packages\n\
- boot_packages\n\
- boot/boot\n\
- boot_boot",
)
.arg(
Arg::new("delimiter")
.help(
"The delimiters to put between the words. \
All permutations of this list will be tried for every string of words.\n\
Specify multiple times to set multiple values.\n\
Defaults to ['/', '_'].",
)
.short('d')
.long("delimiter")
.action(ArgAction::Append),
)
.arg(
Arg::new("max-length")
.help("The maximum number of words up to which to build strings.")
.long("max")
.long("max-length")
.short('m')
.default_value("5")
.value_parser(value_parser!(usize)),
)
.arg(
Arg::new("continue")
.help("Can be used to continue a previous operation where it stopped. Word list and delimiters must match.")
.short('c')
.long("continue")
)
.arg(
Arg::new("threads")
.help("The number of workers to run in parallel.")
.long("threads")
.short('n')
.default_value("6")
.value_parser(value_parser!(usize))
)
.arg(
Arg::new("words")
.help("Path to a file containing words line by line.")
.required(true)
.value_parser(value_parser!(PathBuf)),
)
.arg(
Arg::new("hashes")
.help(
"Path to a file containing the hashes to attempt to brute force. \
Hashes are expected in hexadecimal notation. \
Only 64-bit hashes are supported."
)
.required(true)
.value_parser(value_parser!(PathBuf)),
)
}
const LINE_FEED: u8 = 0x0A;
const UNDERSCORE: u8 = 0x5F;
const ZERO: u8 = 0x30;
const PREFIXES: [&str; 36] = [
"",
"content/characters/",
"content/debug/",
"content/decals/",
"content/environment/",
"content/fx/",
"content/fx/particles/",
"content/gizmos/",
"content/items/",
"content/levels/",
"content/liquid_area/",
"content/localization/",
"content/materials/",
"content/minion_impact_assets/",
"content/pickups/",
"content/shading_environments/",
"content/textures/",
"content/ui/",
"content/videos/",
"content/vo/",
"content/volume_types/",
"content/weapons/",
"content/",
"core/",
"core/units/",
"packages/boot_assets/",
"packages/content/",
"packages/game_scripts/",
"packages/strings/",
"packages/ui/",
"packages/",
"wwise/events/",
"wwise/packages/",
"wwise/world_sound_fx/",
"wwise/events/weapons/",
"wwise/events/minions/",
];
fn make_info_printer(rx: Receiver<(usize, usize, String)>, hash_count: usize) -> JoinHandle<()> {
std::thread::spawn(move || {
let mut writer = std::io::stderr();
let mut total_count = 0;
let mut total_found = 0;
let mut start = Instant::now();
while let Ok((count, found, last)) = rx.recv() {
total_count += count;
total_found += found;
let now = Instant::now();
if (now - start).as_millis() > 250 {
let s = &last[0..std::cmp::min(last.len(), 60)];
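// The window above is roughly 250 ms, so scaling the window's count
// by 4 approximates a per-second rate.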
let s = format!(
"\r{:12} per second | {total_found:6}/{hash_count} found | {s:<60}",
total_count * 4
);
writer.write_all(s.as_bytes()).unwrap();
total_count = 0;
start = now;
}
}
})
}
fn make_stdout_printer(rx: Receiver<Vec<u8>>) -> JoinHandle<()> {
std::thread::spawn(move || {
let mut writer = std::io::stdout();
while let Ok(buf) = rx.recv() {
writer.write_all(&buf).unwrap();
}
})
}
struct State {
delimiter_lists: Arc<Vec<Vec<String>>>,
hashes: Arc<HashSet<Murmur64>>,
words: Arc<Vec<String>>,
delimiters_len: usize,
stdout_tx: Sender<Vec<u8>>,
info_tx: Sender<(usize, usize, String)>,
}
fn make_worker(rx: Receiver<Vec<usize>>, state: State) -> JoinHandle<()> {
std::thread::spawn(move || {
let delimiter_lists = &state.delimiter_lists;
let hashes = &state.hashes;
let words = &state.words;
let delimiters_len = state.delimiters_len;
let mut count = 0;
let mut found = 0;
let mut buf = Vec::with_capacity(1024);
while let Ok(indices) = rx.recv() {
let sequence = indices.iter().map(|i| words[*i].as_str());
// We only want delimiters between words, so we keep that iterator shorter by
// one.
let delimiter_count = sequence.len() as u32 - 1;
for prefix in PREFIXES.iter().map(|p| p.as_bytes()) {
buf.clear();
// We can keep the prefix at the front of the buffer and only
// replace the parts after that.
let prefix_len = prefix.len();
buf.extend_from_slice(prefix);
for delims in delimiter_lists
.iter()
.take(delimiters_len.pow(delimiter_count))
{
buf.truncate(prefix_len);
let delims = delims
.iter()
.map(|s| s.as_str())
.take(delimiter_count as usize);
sequence
.clone()
.interleave(delims.clone())
.for_each(|word| buf.extend_from_slice(word.as_bytes()));
count += 1;
let hash = Murmur64::hash(&buf);
if hashes.contains(&hash) {
found += 1;
buf.push(LINE_FEED);
if state.stdout_tx.send(buf.clone()).is_err() {
return;
}
} else {
let word_len = buf.len();
// If the regular word itself didn't match, we check
// for numbered suffixes.
// For now, we only check up to `09` to avoid more complex logic
// writing into the buffer.
// Packages that contain files with higher numbers than this
// should hopefully become easier to spot once a good number of
// hashes is found.
for i in 1..=9 {
buf.truncate(word_len);
buf.push(UNDERSCORE);
buf.push(ZERO);
buf.push(ZERO + i);
count += 1;
let hash = Murmur64::hash(&buf);
if hashes.contains(&hash) {
found += 1;
buf.push(LINE_FEED);
if state.stdout_tx.send(buf.clone()).is_err() {
return;
}
} else {
break;
}
}
}
}
}
if count >= 2 * 1024 * 1024 {
// The last prefix in the set is the one that will stay in the buffer
// when we're about to print here.
// So we strip that, to show just the generated part.
// We also restrict the length to stay on a single line.
let prefix_len = PREFIXES[35].len();
// No need to wait for this
let _ = state.info_tx.try_send((
count,
found,
String::from_utf8_lossy(&buf[prefix_len..]).to_string(),
));
count = 0;
found = 0;
}
}
})
}
fn build_delimiter_lists(delimiters: impl AsRef<[String]>, max_length: usize) -> Vec<Vec<String>> {
let delimiters = delimiters.as_ref();
let mut indices = vec![0; max_length];
let mut list = Vec::new();
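// Odometer-style enumeration: `indices` counts in base `delimiters.len()`,
// producing each of the len^max_length combinations exactly once.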
for _ in 0..delimiters.len().pow(max_length as u32) {
list.push(
indices
.iter()
.map(|i| delimiters[*i].clone())
.collect::<Vec<_>>(),
);
for v in indices.iter_mut() {
if *v >= delimiters.len() - 1 {
*v = 0;
break;
} else {
*v += 1;
}
}
}
list
}
fn build_initial_indices(
cont: Option<&String>,
delimiters: impl AsRef<[String]>,
words: impl AsRef<[String]>,
) -> Result<Vec<usize>> {
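// To resume, split the continuation string on every delimiter and map
// each piece back to its index in the word list.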
if let Some(cont) = cont {
let mut splits = vec![cont.clone()];
for delim in delimiters.as_ref().iter() {
splits = splits
.iter()
.flat_map(|s| s.split(delim))
.map(|s| s.to_string())
.collect();
}
let indices = splits
.into_iter()
.map(|s| {
words
.as_ref()
.iter()
.enumerate()
.find(|(_, v)| s == **v)
.map(|(i, _)| i)
.ok_or_else(|| eyre::eyre!("'{}' is not in the word list", s))
})
.collect::<Result<_>>()?;
tracing::info!("Continuing from '{}' -> '{:?}'", cont, &indices);
Ok(indices)
} else {
Ok(vec![0])
}
}
#[tracing::instrument(skip_all)]
#[allow(clippy::mut_range_bound)]
pub(crate) fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
let max_length: usize = matches
.get_one::<usize>("max-length")
.copied()
.expect("parameter has default");
let num_threads: usize = matches
.get_one::<usize>("threads")
.copied()
.expect("parameter has default");
let words = {
let path = matches
.get_one::<PathBuf>("words")
.expect("missing required parameter");
let file = fs::read_to_string(path)
.wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
let words: Vec<_> = file.lines().map(str::to_string).collect();
if words.is_empty() {
eyre::bail!("Word list must not be empty");
}
Arc::new(words)
};
let hashes = {
let path = matches
.get_one::<PathBuf>("hashes")
.expect("missing required argument");
let content = fs::read_to_string(path)
.wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
let hashes: Result<HashSet<_>, _> = content
.lines()
.map(|s| u64::from_str_radix(s, 16).map(Murmur64::from))
.collect();
let hashes = hashes?;
tracing::trace!("{:?}", hashes);
Arc::new(hashes)
};
let mut delimiters: Vec<String> = matches
.get_many::<String>("delimiter")
.unwrap_or_default()
.cloned()
.collect();
if delimiters.is_empty() {
delimiters.push(String::from("/"));
delimiters.push(String::from("_"));
}
let delimiters_len = delimiters.len();
let word_count = words.len();
tracing::info!("{} words to try", word_count);
// To be able to easily combine the permutations of words and delimiters,
// we turn the latter into a pre-defined list of all permutations of delimiters
// that are possible at the given amount of words.
// Combining `Iterator::cycle` with `Itertools::permutations` works, but
// with a high `max_length`, it runs OOM.
// So we basically have to implement a smaller version of the iterative algorithm we use later on
// to build permutations of the actual words.
let delimiter_lists = {
let lists = build_delimiter_lists(&delimiters, max_length - 1);
Arc::new(lists)
};
tracing::debug!("{:?}", delimiter_lists);
let (info_tx, info_rx) = bounded(100);
let (stdout_tx, stdout_rx) = unbounded::<Vec<u8>>();
let (task_tx, task_rx) = bounded::<Vec<usize>>(num_threads * 4);
let mut handles = Vec::new();
for _ in 0..num_threads {
let handle = make_worker(
task_rx.clone(),
State {
delimiter_lists: Arc::clone(&delimiter_lists),
hashes: Arc::clone(&hashes),
words: Arc::clone(&words),
delimiters_len,
stdout_tx: stdout_tx.clone(),
info_tx: info_tx.clone(),
},
);
handles.push(handle);
}
// These are only used inside the worker threads, but due to the loops above, we had to
// clone them one too many times.
// So we drop that extra reference immediately, to ensure that the channels can
// disconnect properly when the threads finish.
drop(stdout_tx);
drop(info_tx);
handles.push(make_info_printer(info_rx, hashes.len()));
handles.push(make_stdout_printer(stdout_rx));
let mut indices =
build_initial_indices(matches.get_one::<String>("continue"), &delimiters, &*words)
.wrap_err("Failed to build initial indices")?;
let mut indices_len = indices.len();
let mut sequence = indices
.iter()
.map(|index| words[*index].as_str())
.collect::<Vec<_>>();
// Prevent re-allocation by reserving as much as we need upfront
indices.reserve(max_length);
sequence.reserve(max_length);
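// `indices` acts as a little-endian counter over the word list: bump the
// first position, carry into the next on overflow, and append a digit
// once the whole counter wraps. Every state is one work item for the pool.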
'outer: loop {
task_tx.send(indices.clone())?;
for i in 0..indices_len {
let index = indices.get_mut(i).unwrap();
let word = sequence.get_mut(i).unwrap();
if *index >= word_count - 1 {
*index = 0;
*word = words[*index].as_str();
if indices.get(i + 1).is_none() {
indices_len += 1;
if indices_len > max_length {
break 'outer;
}
indices.push(0);
sequence.push(words[0].as_str());
break;
}
} else {
*index += 1;
*word = words[*index].as_str();
break;
}
}
}
// Dropping the senders will disconnect the channel,
// so that the threads holding the other end will eventually
// complete as well.
drop(task_tx);
for handle in handles {
match handle.join() {
Ok(_) => {}
Err(value) => {
if let Some(err) = value.downcast_ref::<String>() {
eyre::bail!("Thread failed: {}", err);
} else {
eyre::bail!("Thread failed with unknown error: {:?}", value);
}
}
}
}
let _ = std::io::stdout().write_all("\r".as_bytes());
Ok(())
}


@@ -0,0 +1,463 @@
use std::collections::HashMap;
use std::path::PathBuf;
use clap::{value_parser, Arg, ArgMatches, Command, ValueEnum};
use color_eyre::eyre::Context;
use color_eyre::Result;
use tokio::fs;
pub(crate) fn command_definition() -> Command {
Command::new("extract-words")
.about(
"Extract unique alphanumeric sequences that match common identifier rules from the given file. \
Only ASCII is supported.",
)
.arg(
Arg::new("file")
.required(true)
.value_parser(value_parser!(PathBuf))
.help("Path to the file to extract words from."),
)
.arg(
Arg::new("min-length")
.help("Minimum length to consider a word.")
.long("min-length")
.short('m')
.default_value("3")
.value_parser(value_parser!(usize))
)
.arg(
Arg::new("algorithm")
.help("The algorithm to determine matching words")
.long("algorithm")
.short('a')
.default_value("identifier")
.value_parser(value_parser!(Algorithm))
)
}
#[derive(Copy, Clone, Debug, Eq, PartialEq, ValueEnum)]
#[value(rename_all = "snake_case")]
enum Algorithm {
Alphabetic,
Alphanumeric,
Identifier,
Number,
Hash32,
Hash64,
Paths,
}
impl Algorithm {
fn is_start(&self, c: char) -> bool {
match self {
Self::Alphabetic => c.is_ascii_alphabetic(),
Self::Alphanumeric => c.is_ascii_alphanumeric(),
Self::Identifier => c.is_ascii_alphabetic(),
Self::Number => c.is_numeric(),
Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
// Supposed to be handled separately
Self::Paths => false,
}
}
fn is_body(&self, c: char) -> bool {
match self {
Self::Alphabetic => c.is_ascii_alphabetic(),
Self::Alphanumeric => c.is_ascii_alphanumeric(),
Self::Identifier => c.is_ascii_alphanumeric(),
Self::Number => c.is_numeric(),
Self::Hash32 | Self::Hash64 => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
// Supposed to be handled separately
Self::Paths => false,
}
}
fn is_length(&self, len: usize) -> bool {
match self {
Self::Alphabetic => true,
Self::Alphanumeric => true,
Self::Identifier => true,
Self::Number => true,
Self::Hash32 => len == 8,
Self::Hash64 => len == 16,
// Supposed to be handled separately
Self::Paths => false,
}
}
}
impl std::fmt::Display for Algorithm {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{}",
match self {
Algorithm::Alphabetic => "alphabetic",
Algorithm::Alphanumeric => "alphanumeric",
Algorithm::Identifier => "identifier",
Algorithm::Number => "number",
Algorithm::Hash32 => "hash32",
Algorithm::Hash64 => "hash64",
Algorithm::Paths => "paths",
}
)
}
}
#[derive(Copy, Clone, Debug)]
enum PathState {
Begin,
PathComponent,
PathSeparator,
Boundary,
NonWord,
End,
}
#[tracing::instrument(skip(chars))]
fn extract_paths(chars: impl Iterator<Item = char>) -> Vec<Vec<String>> {
let mut chars = chars.peekable();
let mut state = PathState::Begin;
let mut list = Vec::new();
let mut path = Vec::new();
let mut word = String::new();
let is_boundary = |c: char| c == '\n' || c == ' ' || c == ',' || c == '\t' || c == '|';
'machine: loop {
state = match state {
PathState::Begin => match chars.next() {
None => PathState::End,
Some(c) if c.is_ascii_alphabetic() => {
word.push(c);
PathState::PathComponent
}
Some(c) if is_boundary(c) => PathState::Boundary,
Some('/') => PathState::PathSeparator,
Some(_) => PathState::NonWord,
},
PathState::PathComponent => match chars.next() {
None => {
path.push(word.clone());
list.push(path.clone());
PathState::End
}
Some(c) if c.is_ascii_alphanumeric() || c == '_' => {
word.push(c);
PathState::PathComponent
}
Some('/') => {
path.push(word.clone());
word.clear();
PathState::PathSeparator
}
Some(c) if is_boundary(c) => {
path.push(word.clone());
list.push(path.clone());
path.clear();
word.clear();
PathState::Boundary
}
Some(_) => {
list.push(path.clone());
path.clear();
word.clear();
PathState::NonWord
}
},
PathState::PathSeparator => match chars.next() {
None => {
list.push(path.clone());
PathState::End
}
Some('/') => PathState::PathSeparator,
Some(c) if c.is_ascii_alphabetic() || c == '_' => {
word.push(c);
PathState::PathComponent
}
Some(c) if is_boundary(c) => {
list.push(path.clone());
path.clear();
PathState::Boundary
}
Some(_) => {
list.push(path.clone());
path.clear();
PathState::NonWord
}
},
PathState::Boundary => match chars.next() {
None => PathState::End,
Some(c) if c.is_ascii_alphabetic() => {
word.push(c);
PathState::PathComponent
}
Some(c) if is_boundary(c) => PathState::Boundary,
Some(_) => PathState::NonWord,
},
PathState::NonWord => match chars.next() {
None => PathState::End,
Some(c) if is_boundary(c) => PathState::Boundary,
Some(_) => PathState::NonWord,
},
PathState::End => {
break 'machine;
}
}
}
list
}
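// Tallies word frequencies per path depth (component index) and prints
// them as CSV: column pair `i` holds the words seen at depth `i`,
// most frequent first.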
#[tracing::instrument(skip(chars))]
fn algorithm_path_components(chars: impl Iterator<Item = char>, min_length: usize) {
let mut chars = chars.peekable();
let mut state = PathState::Begin;
let mut word = String::new();
let mut lists = vec![HashMap::<String, usize>::new()];
let mut index = 0;
let is_boundary = |c: char| c == '\n' || c == ' ' || c == ',' || c == '\t';
'machine: loop {
state = match state {
PathState::Begin => match chars.next() {
None => PathState::End,
Some(c) if c.is_ascii_alphabetic() => {
word.push(c);
PathState::PathComponent
}
Some(c) if is_boundary(c) => PathState::Boundary,
// Ignore leading path separators to not trigger the logic of advancing
// the component count
Some('/') => PathState::Boundary,
Some(_) => PathState::NonWord,
},
PathState::PathComponent => match chars.next() {
None => PathState::End,
Some(c) if c.is_ascii_alphanumeric() || c == '_' => {
word.push(c);
PathState::PathComponent
}
Some('/') => PathState::PathSeparator,
Some(c) => {
if index > 0 && word.len() >= min_length {
let list = &mut lists[index];
list.entry(word.clone())
.and_modify(|count| *count += 1)
.or_insert(1);
}
word.clear();
index = 0;
if is_boundary(c) {
PathState::Boundary
} else {
PathState::NonWord
}
}
},
PathState::PathSeparator => {
if word.len() >= min_length {
let list = &mut lists[index];
list.entry(word.clone())
.and_modify(|count| *count += 1)
.or_insert(1);
}
word.clear();
index += 1;
if lists.get(index).is_none() {
lists.push(HashMap::new());
}
// Ignore multiple separators
while chars.next_if(|c| *c == '/').is_some() {}
match chars.next() {
None => PathState::End,
Some(c) if c.is_ascii_alphabetic() || c == '_' => {
word.push(c);
PathState::PathComponent
}
Some(c) if is_boundary(c) => {
index = 0;
PathState::Boundary
}
Some(_) => {
index = 0;
PathState::NonWord
}
}
}
PathState::Boundary => match chars.next() {
None => PathState::End,
Some(c) if c.is_ascii_alphabetic() => {
word.push(c);
PathState::PathComponent
}
Some(c) if is_boundary(c) => PathState::Boundary,
Some(_) => PathState::NonWord,
},
PathState::NonWord => match chars.next() {
None => PathState::End,
Some(c) if is_boundary(c) => PathState::Boundary,
Some(_) => PathState::NonWord,
},
PathState::End => {
if word.len() >= min_length {
let list = &mut lists[index];
list.entry(word.clone())
.and_modify(|count| *count += 1)
.or_insert(1);
}
break 'machine;
}
}
}
for i in 0..lists.len() {
print!("Word {i}, Count {i},");
}
println!();
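// Transpose the per-depth lists: row `j` holds the j-th most frequent
// word of each depth, with `None` padding where a depth has fewer words.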
let mut lines: Vec<Vec<Option<(String, usize)>>> = Vec::new();
for (i, list) in lists.into_iter().enumerate() {
let mut entries = list.into_iter().collect::<Vec<_>>();
entries.sort_by(|(_, a), (_, b)| b.cmp(a));
for (j, (word, count)) in entries.into_iter().enumerate() {
if let Some(line) = lines.get_mut(j) {
while line.len() < i {
line.push(None);
}
line.push(Some((word, count)));
} else {
let mut line = Vec::new();
while line.len() < i {
line.push(None);
}
line.push(Some((word, count)));
lines.push(line);
}
}
}
for line in lines.iter() {
for cell in line.iter() {
if let Some((word, count)) = cell {
print!("{},{},", word, count);
} else {
print!(",,");
}
}
println!();
}
}
#[derive(Copy, Clone, Debug)]
enum State {
Begin,
NonWord,
Word,
End,
}
#[tracing::instrument(skip_all)]
pub(crate) async fn run(_ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
let path = matches
.get_one::<PathBuf>("file")
.expect("missing required parameter");
let algorithm = matches
.get_one::<Algorithm>("algorithm")
.expect("parameter has default");
let min_length = matches
.get_one::<usize>("min-length")
.copied()
.expect("paramter has default");
let content = fs::read_to_string(&path)
.await
.wrap_err_with(|| format!("Failed to read file '{}'", path.display()))?;
let mut chars = content.chars();
if *algorithm == Algorithm::Paths {
algorithm_path_components(chars, min_length);
return Ok(());
}
let mut state = State::Begin;
let mut word = String::new();
let mut visited = HashMap::new();
'machine: loop {
state = match state {
State::Begin => match chars.next() {
None => State::End,
Some(c) if algorithm.is_start(c) => {
word.push(c);
State::Word
}
Some(_) => State::NonWord,
},
State::End => break 'machine,
State::NonWord => match chars.next() {
None => State::End,
// A word can only start here, so apply the start rule, not the body rule.
Some(c) if algorithm.is_start(c) => {
word.push(c);
State::Word
}
Some(_) => State::NonWord,
},
State::Word => match chars.next() {
None => {
if word.len() >= min_length && algorithm.is_length(word.len()) {
visited
.entry(word.clone())
.and_modify(|v| *v += 1)
.or_insert(1);
}
State::End
}
Some(c) if algorithm.is_body(c) => {
word.push(c);
State::Word
}
Some(_) => {
if word.len() >= min_length && algorithm.is_length(word.len()) {
visited
.entry(word.clone())
.and_modify(|v| *v += 1)
.or_insert(1);
}
word.clear();
State::NonWord
}
},
}
}
let mut entries: Vec<(String, usize)> = visited.into_iter().collect();
// Reverse sides during comparison to get "highest to lowest"
entries.sort_by(|(_, a), (_, b)| b.cmp(a));
entries
.iter()
.for_each(|(word, count)| println!("{:<16} {}", word, count));
Ok(())
}


@@ -0,0 +1,26 @@
use clap::{ArgMatches, Command};
use color_eyre::Result;
mod brute_force_words;
mod extract_words;
pub(crate) fn command_definition() -> Command {
Command::new("experiment")
.subcommand_required(true)
.about("A collection of utilities and experiments.")
.subcommand(brute_force_words::command_definition())
.subcommand(extract_words::command_definition())
}
#[tracing::instrument(skip_all)]
pub(crate) async fn run(ctx: sdk::Context, matches: &ArgMatches) -> Result<()> {
match matches.subcommand() {
// It's fine to block here, as this is the only thing that's executing on the runtime.
// The alternative, `spawn_blocking`, would require the values involved to be Send + Sync.
Some(("brute-force-words", sub_matches)) => brute_force_words::run(ctx, sub_matches),
Some(("extract-words", sub_matches)) => extract_words::run(ctx, sub_matches).await,
_ => unreachable!(
"clap is configured to require a subcommand, and they're all handled above"
),
}
}


@@ -21,6 +21,7 @@ mod cmd {
pub mod build;
pub mod bundle;
pub mod dictionary;
pub mod experiment;
pub mod migrate;
pub mod murmur;
pub mod new;
@@ -56,6 +57,7 @@ async fn main() -> Result<()> {
.subcommand(cmd::build::command_definition())
.subcommand(cmd::bundle::command_definition())
.subcommand(cmd::dictionary::command_definition())
.subcommand(cmd::experiment::command_definition())
.subcommand(cmd::migrate::command_definition())
.subcommand(cmd::murmur::command_definition())
.subcommand(cmd::new::command_definition())
@@ -133,6 +135,7 @@ async fn main() -> Result<()> {
Some(("build", sub_matches)) => cmd::build::run(ctx, sub_matches).await?,
Some(("bundle", sub_matches)) => cmd::bundle::run(ctx, sub_matches).await?,
Some(("dictionary", sub_matches)) => cmd::dictionary::run(ctx, sub_matches).await?,
Some(("experiment", sub_matches)) => cmd::experiment::run(ctx, sub_matches).await?,
Some(("migrate", sub_matches)) => cmd::migrate::run(ctx, sub_matches).await?,
Some(("murmur", sub_matches)) => cmd::murmur::run(ctx, sub_matches).await?,
Some(("new", sub_matches)) => cmd::new::run(ctx, sub_matches).await?,