From f76acf5407b5d5084b991696815cff4b9ff92049 Mon Sep 17 00:00:00 2001 From: Lucas Schwiderski Date: Wed, 28 Dec 2022 19:44:53 +0100 Subject: [PATCH] fix: Fix serializing Unicode The index operator doesn't use the `char` boundaries, but rather byte boundaries. So I switched back to a simpler, but slightly less efficient loop that simply adds individual characters to the output. It also doesn't escape Unicode anymore, as this shouldn't be an issue in UTF-8 encoded output files. --- CHANGELOG.adoc | 4 ++++ src/ser.rs | 29 ++++++----------------------- 2 files changed, 10 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.adoc b/CHANGELOG.adoc index 1741d61..aad2451 100644 --- a/CHANGELOG.adoc +++ b/CHANGELOG.adoc @@ -6,6 +6,10 @@ == [Unreleased] +=== Fixed + +- fix serializing Unicode + == [v0.2.0] - 2022-11-25 === Added diff --git a/src/ser.rs b/src/ser.rs index 51c9fd5..a10c566 100644 --- a/src/ser.rs +++ b/src/ser.rs @@ -121,48 +121,31 @@ impl<'a> serde::ser::Serializer for &'a mut Serializer { fn serialize_str(self, v: &str) -> Result { self.ensure_top_level_struct()?; + let needs_escapes = v.is_empty() || v.contains([' ', '\n', '\r', '\t', '=', '\'', '"', '\\', '/']); + if needs_escapes { self.output += "\""; - let len = v.len(); - let chars = v.chars(); - let mut start = 0; - - for (i, c) in chars.enumerate() { - if ('\x20'..='\x7e').contains(&c) - && !['\t', '\n', '\r', '\"', '\\', '/'].contains(&c) - { - continue; - } - - self.output += &v[start..i]; - self.output.push('\\'); - + for c in v.chars() { match c { '\t' => { + self.output.push('\\'); self.output.push('t'); } '\n' => { + self.output.push('\\'); self.output.push('n'); } '\r' => { + self.output.push('\\'); self.output.push('r'); } - '\x7f'.. => { - self.output += &format!("u{:4x}", c as u32); - } c => { self.output.push(c); } }; - - start = i + 1; - } - - if start < len { - self.output += &v[start..]; } self.output += "\"";