diff --git a/crates/dtmt/Cargo.toml b/crates/dtmt/Cargo.toml index 69bbc31..688066f 100644 --- a/crates/dtmt/Cargo.toml +++ b/crates/dtmt/Cargo.toml @@ -33,7 +33,10 @@ path-slash = "0.2.1" async-recursion = "1.0.2" notify = "5.1.0" luajit2-sys = { path = "../../lib/luajit2-sys", version = "*" } -shlex = "1.2.0" +shlex = { version = "1.2.0", optional = true } [dev-dependencies] tempfile = "3.3.0" + +[features] +shlex-bench = ["dep:shlex"] diff --git a/crates/dtmt/src/cmd/bundle/extract.rs b/crates/dtmt/src/cmd/bundle/extract.rs index 5e1c03b..9a0f1dd 100644 --- a/crates/dtmt/src/cmd/bundle/extract.rs +++ b/crates/dtmt/src/cmd/bundle/extract.rs @@ -3,7 +3,7 @@ use std::path::{Path, PathBuf}; use std::sync::Arc; use clap::{value_parser, Arg, ArgAction, ArgMatches, Command}; -use color_eyre::eyre::{self, Context, Result}; +use color_eyre::eyre::{self, bail, Context, Result}; use color_eyre::{Help, Report}; use futures::future::try_join_all; use futures::StreamExt; @@ -12,7 +12,9 @@ use sdk::{Bundle, BundleFile, CmdLine}; use tokio::fs; use crate::cmd::util::resolve_bundle_paths; +use crate::shell_parse::ShellParser; +#[inline] fn parse_glob_pattern(s: &str) -> Result { match Pattern::new(s) { Ok(p) => Ok(p), @@ -20,6 +22,7 @@ fn parse_glob_pattern(s: &str) -> Result { } } +#[inline] fn flatten_name(s: &str) -> String { s.replace('/', "_") } @@ -131,26 +134,29 @@ async fn parse_command_line_template(tmpl: &String) -> Result { let mut cmd = if matches!(fs::try_exists(tmpl).await, Ok(true)) { let path = PathBuf::from(tmpl); if path.file_name() == Some(OsStr::new("main.py")) { - let arg = path.display().to_string(); let mut cmd = CmdLine::new("python"); - cmd.arg(shlex::quote(&arg).to_string()); + cmd.arg(path); cmd } else { CmdLine::new(path) } } else { - let Some(args) = shlex::split(tmpl) else { - eyre::bail!("Invalid shell syntax"); - }; + let mut parsed = ShellParser::new(tmpl.as_bytes()); + // Safety: The initial `tmpl` was a `&String` (i.e. 
/// Returns `true` for the bytes that terminate an unquoted word: space, tab and line feed.
fn is_separator(byte: u8) -> bool {
    matches!(byte, b' ' | b'\t' | b'\n')
}

/// A zero-copy splitter for simple, shell-like command lines.
///
/// The parser is an [`Iterator`] over the words of the input. Every word is a sub-slice of the
/// original bytes; nothing is copied or rewritten.
///
/// The supported grammar is deliberately small:
///
/// - Unquoted words are separated by spaces, tabs and line feeds.
/// - A word that *starts* with `'` or `"` runs up to the next occurrence of the same quote
///   character. The quotes themselves are not part of the word.
/// - There is no escape character, and a quote in the middle of an unquoted word is an
///   ordinary byte (`a"b` is the single word `a"b`).
/// - Text directly following a closing quote starts a new word (`'a'b` yields `a` and `b`).
///
/// All delimiters are ASCII bytes. Splitting valid UTF-8 at ASCII bytes never cuts through a
/// multi-byte sequence, so every word produced from valid UTF-8 input is valid UTF-8 itself.
///
/// An opening quote without a matching closing quote ends the iteration and sets
/// [`errored`](Self::errored). Callers must check that flag once the iterator returned `None`.
#[derive(Clone, Debug)]
pub struct ShellParser<'a> {
    /// The complete input.
    bytes: &'a [u8],
    /// Index of the first byte that has not been consumed yet. Never exceeds `bytes.len()`.
    offset: usize,
    /// `true` once an unterminated quote has been encountered.
    ///
    /// Only meaningful after the iterator has returned `None`.
    pub errored: bool,
}

impl<'a> ShellParser<'a> {
    /// Creates a parser positioned at the start of `bytes`.
    pub fn new(bytes: &'a [u8]) -> Self {
        Self {
            bytes,
            offset: 0,
            errored: false,
        }
    }

    /// Consumes and returns the next word.
    ///
    /// Returns `None` when only separators are left, or when a quoted word is not terminated.
    /// The latter case additionally sets `errored`. In both cases the whole input has been
    /// consumed afterwards, so every following call returns `None` as well.
    fn parse_word(&mut self) -> Option<&'a [u8]> {
        // Copy the reference out of `self`, so that the slices handed out below borrow the
        // input for `'a` and leave `self` free to be updated.
        let bytes = self.bytes;

        // Skip the separators in front of the word.
        let rest = &bytes[self.offset..];
        let skipped = rest.iter().take_while(|&&b| is_separator(b)).count();
        self.offset += skipped;
        let rest = &rest[skipped..];

        // Nothing left: this is the regular end of the input.
        let (&first, tail) = rest.split_first()?;

        match first {
            quote @ (b'\'' | b'"') => match tail.iter().position(|&b| b == quote) {
                Some(len) => {
                    // Step over the opening quote, the content and the closing quote.
                    self.offset += len + 2;
                    Some(&tail[..len])
                }
                None => {
                    self.offset = bytes.len();
                    self.errored = true;
                    None
                }
            },
            _ => {
                let len = rest
                    .iter()
                    .position(|&b| is_separator(b))
                    .unwrap_or(rest.len());
                // Also consume the separator that ended the word, if there was one.
                self.offset += (len + 1).min(rest.len());
                Some(&rest[..len])
            }
        }
    }
}

impl<'a> Iterator for ShellParser<'a> {
    type Item = &'a [u8];

    fn next(&mut self) -> Option<Self::Item> {
        self.parse_word()
    }
}

// `parse_word` only returns `None` after the entire input has been consumed.
impl std::iter::FusedIterator for ShellParser<'_> {}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_one_word() {
        let mut it = ShellParser::new(b"hello");
        assert_eq!(it.next(), Some("hello".as_bytes()));
        assert_eq!(it.next(), None);
        assert!(!it.errored);
    }

    #[test]
    fn test_one_single() {
        let mut it = ShellParser::new(b"'hello'");
        assert_eq!(it.next(), Some("hello".as_bytes()));
        assert_eq!(it.next(), None);
        assert!(!it.errored);
    }

    #[test]
    fn test_one_double() {
        let mut it = ShellParser::new(b"\"hello world\"");
        assert_eq!(it.next(), Some("hello world".as_bytes()));
        assert_eq!(it.next(), None);
        assert!(!it.errored);
    }

    #[test]
    fn test_open_quote() {
        let mut it = ShellParser::new(b"'hello");
        assert_eq!(it.next(), None);
        assert!(it.errored)
    }

    #[test]
    fn test_open_double_quote_after_word() {
        let mut it = ShellParser::new(b"cmd \"hello");
        assert_eq!(it.next(), Some("cmd".as_bytes()));
        assert!(!it.errored);
        assert_eq!(it.next(), None);
        assert!(it.errored);
        // The iterator stays exhausted.
        assert_eq!(it.next(), None);
    }

    #[test]
    fn test_empty_and_blank_input() {
        let mut it = ShellParser::new(b"");
        assert_eq!(it.next(), None);
        assert!(!it.errored);

        let mut it = ShellParser::new(b" \t\n ");
        assert_eq!(it.next(), None);
        assert!(!it.errored);
    }

    #[test]
    fn test_empty_quotes() {
        let mut it = ShellParser::new(b"a '' b");
        assert_eq!(it.next(), Some("a".as_bytes()));
        assert_eq!(it.next(), Some("".as_bytes()));
        assert_eq!(it.next(), Some("b".as_bytes()));
        assert_eq!(it.next(), None);
    }

    #[test]
    fn test_surrounding_separators() {
        let mut it = ShellParser::new(b"\t one \n two  ");
        assert_eq!(it.next(), Some("one".as_bytes()));
        assert_eq!(it.next(), Some("two".as_bytes()));
        assert_eq!(it.next(), None);
        assert!(!it.errored);
    }

    #[test]
    fn test_ww2ogg() {
        let mut it = ShellParser::new(
            b"ww2ogg.exe --pcb \"/usr/share/ww2ogg/packed_cookbook_aoTuV_603.bin\"",
        );
        assert_eq!(it.next(), Some("ww2ogg.exe".as_bytes()));
        assert_eq!(it.next(), Some("--pcb".as_bytes()));
        assert_eq!(
            it.next(),
            Some("/usr/share/ww2ogg/packed_cookbook_aoTuV_603.bin".as_bytes())
        );
        assert_eq!(it.next(), None);
    }
}
/// Micro-benchmarks for [`ShellParser`].
///
/// Every input is measured with the custom parser. When the `shlex-bench` feature is enabled,
/// the same input is additionally run through `shlex::bytes::Shlex` for comparison.
#[cfg(test)]
mod bench {
    extern crate test;

    use super::*;
    #[cfg(feature = "shlex-bench")]
    use shlex::bytes::Shlex;
    use test::Bencher;

    /// A command line with two plain words and one double-quoted path.
    mod ww2ogg {
        use super::*;

        const TEMPLATE: &[u8] =
            b"ww2ogg.exe --pcb \"/usr/share/ww2ogg/packed_cookbook_aoTuV_603.bin\"";

        #[bench]
        fn custom(b: &mut Bencher) {
            let input = test::black_box(TEMPLATE);
            b.iter(|| {
                let words: Vec<_> = ShellParser::new(input).collect();
                test::black_box(words);
            })
        }

        #[cfg(feature = "shlex-bench")]
        #[bench]
        fn shlex(b: &mut Bencher) {
            let input = test::black_box(TEMPLATE);
            b.iter(|| {
                let words: Vec<_> = Shlex::new(input).collect();
                test::black_box(words);
            })
        }
    }

    /// A single single-quoted word.
    mod one_single {
        use super::*;

        const TEMPLATE: &[u8] = b"'hello'";

        #[bench]
        fn custom(b: &mut Bencher) {
            let input = test::black_box(TEMPLATE);
            b.iter(|| {
                let words: Vec<_> = ShellParser::new(input).collect();
                test::black_box(words);
            })
        }

        #[cfg(feature = "shlex-bench")]
        #[bench]
        fn shlex(b: &mut Bencher) {
            let input = test::black_box(TEMPLATE);
            b.iter(|| {
                let words: Vec<_> = Shlex::new(input).collect();
                test::black_box(words);
            })
        }
    }
}