Add simpler shell parser

This obsoletes `shlex`. The quoting turned out unnecessary, and the
splitting supported a lot more than we need. It also forced
unnecessary allocations: the splitting doesn't add any characters and
keeps UTF-8 intact, so returning slices from the input is perfectly
possible.
However, this benefit of the implementation will only come into play in
the future, as `CmdLine` still requires that the slices are cloned.

Still, the custom implementation performs about 3x faster.
This commit is contained in:
Lucas Schwiderski 2024-05-14 00:50:01 +02:00
parent 7a1727ff3b
commit 535a30a7ca
Signed by: lucas
GPG key ID: AA12679AAA6DF4D8
4 changed files with 214 additions and 14 deletions

View file

@ -33,7 +33,10 @@ path-slash = "0.2.1"
async-recursion = "1.0.2"
notify = "5.1.0"
luajit2-sys = { path = "../../lib/luajit2-sys", version = "*" }
shlex = "1.2.0"
shlex = { version = "1.2.0", optional = true }
[dev-dependencies]
tempfile = "3.3.0"
[features]
shlex-bench = ["dep:shlex"]

View file

@ -3,7 +3,7 @@ use std::path::{Path, PathBuf};
use std::sync::Arc;
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
use color_eyre::eyre::{self, Context, Result};
use color_eyre::eyre::{self, bail, Context, Result};
use color_eyre::{Help, Report};
use futures::future::try_join_all;
use futures::StreamExt;
@ -12,7 +12,9 @@ use sdk::{Bundle, BundleFile, CmdLine};
use tokio::fs;
use crate::cmd::util::resolve_bundle_paths;
use crate::shell_parse::ShellParser;
#[inline]
fn parse_glob_pattern(s: &str) -> Result<Pattern, String> {
match Pattern::new(s) {
Ok(p) => Ok(p),
@ -20,6 +22,7 @@ fn parse_glob_pattern(s: &str) -> Result<Pattern, String> {
}
}
/// Returns a copy of `s` in which every `/` has been replaced by `_`.
///
/// All other characters are passed through unchanged.
#[inline]
fn flatten_name(s: &str) -> String {
    s.chars()
        .map(|ch| if ch == '/' { '_' } else { ch })
        .collect()
}
@ -131,26 +134,29 @@ async fn parse_command_line_template(tmpl: &String) -> Result<CmdLine> {
let mut cmd = if matches!(fs::try_exists(tmpl).await, Ok(true)) {
let path = PathBuf::from(tmpl);
if path.file_name() == Some(OsStr::new("main.py")) {
let arg = path.display().to_string();
let mut cmd = CmdLine::new("python");
cmd.arg(shlex::quote(&arg).to_string());
cmd.arg(path);
cmd
} else {
CmdLine::new(path)
}
} else {
let Some(args) = shlex::split(tmpl) else {
eyre::bail!("Invalid shell syntax");
};
let mut parsed = ShellParser::new(tmpl.as_bytes());
// SAFETY: `tmpl` is a `String`, i.e. valid UTF-8, and `ShellParser` only returns
// sub-slices of its input that are cut at ASCII bytes (whitespace and quotes).
// It neither inserts nor removes bytes and never splits a multi-byte UTF-8
// sequence, so every returned word is still valid UTF-8.
let mut cmd = CmdLine::new(unsafe {
let bytes = parsed.next().expect("Template is not empty");
String::from_utf8_unchecked(bytes.to_vec())
});
// We already checked that the template is not empty
let mut cmd = CmdLine::new(args[0].clone());
let mut it = args.iter();
// Skip the first one, that's the command name
it.next();
while let Some(arg) = parsed.next() {
// Safety: See above.
cmd.arg(unsafe { String::from_utf8_unchecked(arg.to_vec()) });
}
for arg in it {
cmd.arg(arg);
if parsed.errored {
bail!("Invalid command line template");
}
cmd

View file

@ -1,6 +1,7 @@
#![feature(io_error_more)]
#![feature(let_chains)]
#![feature(result_flattening)]
#![feature(test)]
#![windows_subsystem = "console"]
use std::path::PathBuf;
@ -27,6 +28,7 @@ mod cmd {
mod util;
pub mod watch;
}
mod shell_parse;
#[derive(Default, Deserialize, Serialize)]
struct GlobalConfig {

View file

@ -0,0 +1,189 @@
/// Scanner state while looking for the end of the current word.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
enum ParserState {
    /// No byte of the word has been seen yet; leading whitespace is skipped.
    Start,
    /// Inside an unquoted word, which ends at the next whitespace byte.
    Word,
    /// Inside a word opened by `'`, which ends at the next `'`.
    SingleQuote,
    /// Inside a word opened by `"`, which ends at the next `"`.
    DoubleQuote,
}

/// A minimal, allocation-free splitter for shell-like command lines.
///
/// Iterating yields the words of the input as sub-slices of the original
/// byte slice. The supported grammar is intentionally small:
///
/// - Words are separated by space, tab or newline.
/// - A word that *starts* with `'` or `"` extends to the next matching quote
///   and may contain whitespace; the quotes are not part of the result.
/// - Quote characters inside an unquoted word are ordinary bytes.
/// - There are no escape sequences.
///
/// If the input ends inside a quoted word, iteration stops and [`errored`]
/// is set to `true`.
///
/// [`errored`]: ShellParser::errored
#[derive(Clone, Debug)]
pub struct ShellParser<'a> {
    /// The complete input.
    bytes: &'a [u8],
    /// Index of the next byte to be consumed.
    offset: usize,
    /// Set once the input turned out to end inside an unterminated quote.
    pub errored: bool,
}

impl<'a> ShellParser<'a> {
    /// Creates a parser positioned at the start of `bytes`.
    pub fn new(bytes: &'a [u8]) -> Self {
        Self {
            bytes,
            offset: 0,
            errored: false,
        }
    }

    /// Consumes input up to and including the delimiter that ends the next
    /// word and returns that word, without surrounding quotes.
    ///
    /// Returns `None` when only whitespace (or nothing) is left, and also
    /// when the input ends inside a quoted word; the latter additionally
    /// sets `self.errored`.
    fn parse_word(&mut self) -> Option<&'a [u8]> {
        // Index of the first byte that belongs to the word. While still in
        // `Start` this is moved past skipped whitespace and the opening quote.
        let mut start = self.offset;
        let mut state = ParserState::Start;

        while let Some(&c) = self.bytes.get(self.offset) {
            // Index of `c` itself; a word ending at a delimiter excludes it.
            let here = self.offset;
            self.offset += 1;

            match (state, c) {
                // Leading whitespace is not part of any word.
                (ParserState::Start, b' ' | b'\t' | b'\n') => start = self.offset,
                // An opening quote switches mode and is dropped from the word.
                (ParserState::Start, b'\'') => {
                    state = ParserState::SingleQuote;
                    start = self.offset;
                }
                (ParserState::Start, b'"') => {
                    state = ParserState::DoubleQuote;
                    start = self.offset;
                }
                // Any other byte is the first byte of an unquoted word.
                (ParserState::Start, _) => state = ParserState::Word,
                // The matching delimiter terminates the word.
                (ParserState::Word, b' ' | b'\t' | b'\n')
                | (ParserState::SingleQuote, b'\'')
                | (ParserState::DoubleQuote, b'"') => {
                    return Some(&self.bytes[start..here]);
                }
                // Everything else is ordinary word content.
                (
                    ParserState::Word | ParserState::SingleQuote | ParserState::DoubleQuote,
                    _,
                ) => {}
            }
        }

        // The input is exhausted without having seen a terminating delimiter.
        match state {
            ParserState::Start => None,
            ParserState::Word => Some(&self.bytes[start..self.offset]),
            ParserState::SingleQuote | ParserState::DoubleQuote => {
                self.errored = true;
                None
            }
        }
    }
}

impl<'a> Iterator for ShellParser<'a> {
    type Item = &'a [u8];

    fn next(&mut self) -> Option<Self::Item> {
        self.parse_word()
    }
}

// `None` is only ever returned once the whole input has been consumed, so
// every later call returns `None` as well.
impl std::iter::FusedIterator for ShellParser<'_> {}
#[cfg(test)]
mod test {
    use super::*;

    /// A single unquoted word is returned as-is, followed by the end of input.
    #[test]
    fn test_one_word() {
        let mut parser = ShellParser::new(b"hello");
        assert_eq!(parser.next(), Some(&b"hello"[..]));
        assert!(parser.next().is_none());
    }

    /// A single-quoted word is returned without its quotes.
    #[test]
    fn test_one_single() {
        let mut parser = ShellParser::new(b"'hello'");
        assert_eq!(parser.next(), Some(&b"hello"[..]));
        assert!(parser.next().is_none());
    }

    /// An unterminated quote yields no word and flags the parser as errored.
    #[test]
    fn test_open_quote() {
        let mut parser = ShellParser::new(b"'hello");
        assert!(parser.next().is_none());
        assert!(parser.errored);
    }

    /// A realistic command line mixing unquoted and double-quoted words.
    #[test]
    fn test_ww2ogg() {
        let expected: [&[u8]; 3] = [
            b"ww2ogg.exe",
            b"--pcb",
            b"/usr/share/ww2ogg/packed_cookbook_aoTuV_603.bin",
        ];

        let mut parser = ShellParser::new(
            b"ww2ogg.exe --pcb \"/usr/share/ww2ogg/packed_cookbook_aoTuV_603.bin\"",
        );
        for word in expected {
            assert_eq!(parser.next(), Some(word));
        }
        assert!(parser.next().is_none());
    }
}
#[cfg(test)]
mod bench {
    extern crate test;

    use super::*;
    #[cfg(feature = "shlex-bench")]
    use shlex::bytes::Shlex;
    use test::Bencher;

    /// Benchmarks on a command line with two plain words and one
    /// double-quoted path.
    mod ww2ogg {
        use super::*;

        const INPUT: &[u8] =
            b"ww2ogg.exe --pcb \"/usr/share/ww2ogg/packed_cookbook_aoTuV_603.bin\"";

        /// Splits the input with `ShellParser`.
        #[bench]
        fn custom(b: &mut Bencher) {
            let input = test::black_box(INPUT);
            b.iter(|| {
                let words: Vec<_> = ShellParser::new(input).collect();
                test::black_box(words);
            });
        }

        /// Splits the same input with `shlex`, for comparison.
        #[cfg(feature = "shlex-bench")]
        #[bench]
        fn shlex(b: &mut Bencher) {
            let input = test::black_box(INPUT);
            b.iter(|| {
                let words: Vec<_> = Shlex::new(input).collect();
                test::black_box(words);
            });
        }
    }

    /// Benchmarks on a single single-quoted word.
    mod one_single {
        use super::*;

        const INPUT: &[u8] = b"'hello'";

        /// Splits the input with `ShellParser`.
        #[bench]
        fn custom(b: &mut Bencher) {
            let input = test::black_box(INPUT);
            b.iter(|| {
                let words: Vec<_> = ShellParser::new(input).collect();
                test::black_box(words);
            });
        }

        /// Splits the same input with `shlex`, for comparison.
        #[cfg(feature = "shlex-bench")]
        #[bench]
        fn shlex(b: &mut Bencher) {
            let input = test::black_box(INPUT);
            b.iter(|| {
                let words: Vec<_> = Shlex::new(input).collect();
                test::black_box(words);
            });
        }
    }
}