diff --git a/Cargo.toml b/Cargo.toml index 9444efa..3539fb0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,7 @@ edition = "2018" [dependencies] linked-hash-map = "0.5.3" +encoding = "0.2" [dev-dependencies] quickcheck = "0.9" diff --git a/src/yaml.rs b/src/yaml.rs index 4bb70da..bcb4d9c 100644 --- a/src/yaml.rs +++ b/src/yaml.rs @@ -157,6 +157,19 @@ impl MarkedEventReceiver for YamlLoader { } } +#[derive(Debug)] +pub enum LoadError { + IO(std::io::Error), + Scan(ScanError), + Decode(std::borrow::Cow<'static, str>), +} + +impl From for LoadError { + fn from(error: std::io::Error) -> Self { + LoadError::IO(error) + } +} + impl YamlLoader { fn insert_new_node(&mut self, node: (Yaml, usize)) { // valid anchor id starts from 1 @@ -199,6 +212,62 @@ impl YamlLoader { } } +pub struct YamlDecoder { + source: T, + trap: encoding::types::DecoderTrap, +} + + +impl YamlDecoder { + pub fn read(source: T) -> YamlDecoder { + YamlDecoder{ + source: source, + trap: encoding::DecoderTrap::Strict, + } + } + + pub fn encoding_trap(&mut self, trap: encoding::types::DecoderTrap) -> &mut Self { + self.trap = trap; + self + } + + pub fn decode(&mut self) -> Result, LoadError> { + let mut buffer = Vec::new(); + self.source.read_to_end(&mut buffer)?; + + // Decodes the input buffer using either UTF-8, UTF-16LE or UTF-16BE depending on the BOM codepoint. + // If the buffer doesn't start with a BOM codepoint, it will use a fallback encoding obtained by + // detect_utf16_endianness. + let (res, _) = encoding::types::decode( + &buffer, + self.trap, + detect_utf16_endianness(&buffer), + ); + let s = res.map_err(LoadError::Decode)?; + YamlLoader::load_from_str(&s).map_err(LoadError::Scan) + } +} + +/// The encoding crate knows how to tell apart UTF-8 from UTF-16LE and utf-16BE, when the +/// bytestream starts with BOM codepoint. +/// However, it doesn't even attempt to guess the UTF-16 endianness of the input bytestream since +/// in the general case the bytestream could start with a codepoint that uses both bytes. +/// +/// The YAML-1.2 spec mandates that the first character of a YAML document is an ASCII character. +/// This allows the encoding to be deduced by the pattern of null (#x00) characters. +// +/// See spec at https://yaml.org/spec/1.2/spec.html#id2771184 +fn detect_utf16_endianness(b: &[u8]) -> encoding::types::EncodingRef { + if b.len() > 1 && (b[0] != b[1]) { + if b[0] == 0 { + return encoding::all::UTF_16BE; + } else if b[1] == 0 { + return encoding::all::UTF_16LE; + } + } + encoding::all::UTF_8 +} + macro_rules! define_as ( ($name:ident, $t:ident, $yt:ident) => ( pub fn $name(&self) -> Option<$t> { @@ -736,4 +805,83 @@ subcommands3: let s = "[".repeat(10_000) + &"]".repeat(10_000); assert!(YamlLoader::load_from_str(&s).is_err()); } + + #[test] + fn test_read_bom() { + let s = b"\xef\xbb\xbf--- +a: 1 +b: 2.2 +c: [1, 2] +"; + let out = YamlDecoder::read(s as &[u8]).decode().unwrap(); + let doc = &out[0]; + assert_eq!(doc["a"].as_i64().unwrap(), 1i64); + assert_eq!(doc["b"].as_f64().unwrap(), 2.2f64); + assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64); + assert!(doc["d"][0].is_badvalue()); + } + + #[test] + fn test_read_utf16le() { + let s = b"\xff\xfe-\x00-\x00-\x00 +\x00a\x00:\x00 \x001\x00 +\x00b\x00:\x00 \x002\x00.\x002\x00 +\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00 +\x00"; + let out = YamlDecoder::read(s as &[u8]).decode().unwrap(); + let doc = &out[0]; + println!("GOT: {:?}", doc); + assert_eq!(doc["a"].as_i64().unwrap(), 1i64); + assert_eq!(doc["b"].as_f64().unwrap(), 2.2f64); + assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64); + assert!(doc["d"][0].is_badvalue()); + } + + #[test] + fn test_read_utf16be() { + let s = b"\xfe\xff\x00-\x00-\x00-\x00 +\x00a\x00:\x00 \x001\x00 +\x00b\x00:\x00 \x002\x00.\x002\x00 +\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00 +"; + let out = YamlDecoder::read(s as &[u8]).decode().unwrap(); + let doc = &out[0]; + println!("GOT: {:?}", doc); + assert_eq!(doc["a"].as_i64().unwrap(), 1i64); + assert_eq!(doc["b"].as_f64().unwrap(), 2.2f64); + assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64); + assert!(doc["d"][0].is_badvalue()); + } + + #[test] + fn test_read_utf16le_nobom() { + let s = b"-\x00-\x00-\x00 +\x00a\x00:\x00 \x001\x00 +\x00b\x00:\x00 \x002\x00.\x002\x00 +\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00 +\x00"; + let out = YamlDecoder::read(s as &[u8]).decode().unwrap(); + let doc = &out[0]; + println!("GOT: {:?}", doc); + assert_eq!(doc["a"].as_i64().unwrap(), 1i64); + assert_eq!(doc["b"].as_f64().unwrap(), 2.2f64); + assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64); + assert!(doc["d"][0].is_badvalue()); + } + + #[test] + fn test_read_trap() { + let s = b"--- +a\xa9: 1 +b: 2.2 +c: [1, 2] +"; + let out = YamlDecoder::read(s as &[u8]).encoding_trap(encoding::DecoderTrap::Ignore).decode().unwrap(); + let doc = &out[0]; + println!("GOT: {:?}", doc); + assert_eq!(doc["a"].as_i64().unwrap(), 1i64); + assert_eq!(doc["b"].as_f64().unwrap(), 2.2f64); + assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64); + assert!(doc["d"][0].is_badvalue()); + } }