Skip to content

Commit

Permalink
New API to get anchors and aliases from a document
Browse files Browse the repository at this point in the history
When deserializing a YAML document, the locations and names of anchors
and aliases are not preserved. That information is useful, though, as
users of the library may want to deduplicate references to the same
mapping.

This commit introduces an API that exposes such mappings by parsing
the metadata in `struct Document`. The new API has been added to
`struct Deserializer` and provides the following signature:

   `pub fn anchors(&self) -> Option<Vec<DocumentAnchor>>`

As an example, given the following YAML document:
```
a:
    enum: &io
        INPUT: 0
        OUTPUT: 1
b:
    enum: *io
c:
    enum: *io
```

this API returns:
```
Some([DocumentAnchor {
    anchor_name: "io",
    anchor_path: "/a/enum",
    aliases: ["/b/enum", "/c/enum"],
}])
```
  • Loading branch information
lucasvr committed Dec 2, 2024
1 parent d770a16 commit 59ee15f
Show file tree
Hide file tree
Showing 4 changed files with 159 additions and 5 deletions.
109 changes: 109 additions & 0 deletions src/de.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use serde::de::{
self, value::StrDeserializer, Deserialize, DeserializeOwned,
DeserializeSeed, Expected, IgnoredAny, Unexpected, Visitor,
};
use std::collections::BTreeMap;
use std::fmt::Debug;
use std::fmt::Formatter;
use std::fmt::Result as FmtResult;
Expand Down Expand Up @@ -122,6 +123,51 @@ impl Debug for Progress<'_> {
}
}

/// Describes one anchor in a YAML document and every alias that refers to it.
///
/// The anchor name, prefixed in the YAML document with "&", is represented
/// without the prefix in `anchor_name`.
/// The `anchor_path` is a string that denotes the path to the anchor in the YAML
/// document. Each key that forms the path is separated from the next by "/".
/// The `aliases` vector contains the path to each reference to the anchor in
/// the YAML document.
///
/// # Examples
///
/// The following YAML document:
/// ```yaml
/// a:
///     enum: &io
///         INPUT: 0
///         OUTPUT: 1
/// b:
///     enum: *io
/// c:
///     enum: *io
/// ```
///
/// is represented by the following `DocumentAnchor`:
/// ```text
/// DocumentAnchor {
///     anchor_name: "io",
///     anchor_path: "/a/enum",
///     aliases: ["/b/enum", "/c/enum"],
/// }
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DocumentAnchor {
    /// The name of the anchor, without the "&" prefix.
    pub anchor_name: String,

    /// The path to the anchor in the YAML document, with keys separated by "/".
    /// A "/" at the beginning of the path denotes the root of the YAML document.
    pub anchor_path: String,

    /// The path to each alias that references the anchor in the YAML document.
    /// Each key that forms the alias path is separated by "/".
    /// A "/" at the beginning of the path denotes the root of the YAML document.
    pub aliases: Vec<String>,
}

impl<'de> Deserializer<'de> {
/// Deserializes an instance of type `T` from a string of YAML text.
///
Expand Down Expand Up @@ -233,6 +279,69 @@ impl<'de> Deserializer<'de> {
Deserializer { progress }
}

/// Gets a vector of anchors, aliases, and where they occur in the YAML document.
///
/// Returns `None` when this deserializer's progress is not
/// `Progress::Document`; otherwise returns one `DocumentAnchor` per entry of
/// the document's `anchor_event_map`, in that map's iteration order. The
/// alias paths of each anchor are listed in the order the alias events
/// appear in the document's event list.
///
/// # Panics
///
/// Panics if an anchor id present in `anchor_event_map` has no entry in
/// `anchor_names`.
pub fn anchors(&self) -> Option<Vec<DocumentAnchor>> {
    let document = match &self.progress {
        Progress::Document(doc) => doc,
        _ => return None,
    };

    // Group the event index of every alias occurrence under the id it refers to.
    let mut alias_events = BTreeMap::<usize, Vec<usize>>::new();
    for (index, (event, _)) in document.events.iter().enumerate() {
        if let Event::Alias(id) = event {
            alias_events.entry(*id).or_default().push(index);
        }
    }

    let anchors: Vec<DocumentAnchor> = document
        .anchor_event_map
        .iter()
        .map(|(anchor_id, &event_index)| {
            let anchor_name = document
                .anchor_names
                .get(anchor_id)
                .expect("every id in `anchor_event_map` must have a name in `anchor_names`")
                .clone();

            // An anchor that is never referenced simply has no alias paths.
            let aliases: Vec<String> = alias_events
                .get(anchor_id)
                .map(|indices| {
                    indices
                        .iter()
                        .map(|&index| self.event_path(index))
                        .collect()
                })
                .unwrap_or_default();

            DocumentAnchor {
                anchor_name,
                anchor_path: self.event_path(event_index),
                aliases,
            }
        })
        .collect();

    Some(anchors)
}

/// Builds the "/"-separated path of mapping keys that leads to the event at
/// `event_index`.
///
/// The events are walked backwards from `event_index` (inclusive) to the
/// start of the document. The first scalar met is taken as the innermost
/// path component; after that, each time the start of a still-open mapping
/// is crossed, the next scalar met outside any closed mapping is taken as
/// the key of that mapping. Scalars inside mappings that were already
/// closed before `event_index` are never taken.
///
/// The result always starts with "/". When the progress is not
/// `Progress::Document`, or no component was found, the result is "/".
///
/// Because the walk includes the event at `event_index` itself, a scalar
/// at that index contributes its own value as the last path component.
/// Sequence events are not interpreted, so positions inside sequences are
/// not reflected in the path.
///
/// # Panics
///
/// Panics if `event_index` is out of bounds for the document's events.
fn event_path(&self, event_index: usize) -> String {
    // Components are gathered innermost-first and reversed once at the end,
    // which avoids shifting the whole vector on every insertion.
    let mut segments: Vec<String> = Vec::new();

    if let Progress::Document(document) = &self.progress {
        // Mappings whose end was seen during the backward walk but whose
        // start has not been reached yet.
        let mut closed_mappings = 0u32;
        // Whether the next eligible scalar should be recorded as a component.
        let mut expect_key = true;

        for (event, _) in document.events[..=event_index].iter().rev() {
            match event {
                Event::MappingEnd => closed_mappings += 1,
                Event::MappingStart(_) => {
                    if closed_mappings > 0 {
                        closed_mappings -= 1;
                    } else {
                        // Start of a mapping that encloses the target event:
                        // its key is the next component to record.
                        expect_key = true;
                    }
                }
                // Only scalars outside closed mappings can name an ancestor.
                Event::Scalar(scalar) if expect_key && closed_mappings == 0 => {
                    segments.push(String::from_utf8_lossy(&scalar.value).into_owned());
                    expect_key = false;
                }
                _ => {}
            }
        }
    }

    segments.reverse();
    format!("/{}", segments.join("/"))
}

fn de<T>(
self,
f: impl for<'document> FnOnce(
Expand Down
4 changes: 3 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,9 @@
#![crate_type = "lib"]

// Re-export commonly used items from other modules
pub use crate::de::{from_reader, from_slice, from_str, Deserializer}; // Deserialization functions
pub use crate::de::{
from_reader, from_slice, from_str, Deserializer, DocumentAnchor
}; // Deserialization functions
pub use crate::modules::error::{Error, Location, Result}; // Error handling types
pub use crate::ser::{to_string, to_writer, Serializer, State}; // Serialization functions
#[doc(inline)]
Expand Down
25 changes: 21 additions & 4 deletions src/loader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use crate::{
de::{Event, Progress},
libyml::{
error::Mark,
parser::{Event as YamlEvent, Parser},
parser::{Anchor, Event as YamlEvent, Parser},
},
modules::error::{self, Error, ErrorImpl, Result},
};
Expand Down Expand Up @@ -57,6 +57,11 @@ pub struct Document<'input> {
/// encountered during parsing, its id is used to look up the index of the corresponding
/// event in the `events` vector.
pub anchor_event_map: BTreeMap<usize, usize>,

/// Map from anchor id to anchor name.
///
/// This field is a `BTreeMap` that maps anchor ids to their corresponding names.
pub anchor_names: BTreeMap<usize, String>,
}

impl<'input> Loader<'input> {
Expand Down Expand Up @@ -140,6 +145,14 @@ impl<'input> Loader<'input> {
events: Vec::new(),
error: None,
anchor_event_map: BTreeMap::new(),
anchor_names: BTreeMap::new(),
};

let anchor_name = |anchor: &Anchor| {
format!("{:?}", anchor)
.trim_start_matches("\"")
.trim_end_matches("\"")
.to_owned()
};

loop {
Expand All @@ -165,6 +178,7 @@ impl<'input> Loader<'input> {
}
YamlEvent::DocumentStart => continue,
YamlEvent::DocumentEnd => return Some(document),

YamlEvent::Alias(alias) => match anchors.get(&alias) {
Some(id) => Event::Alias(*id),
None => {
Expand All @@ -178,31 +192,34 @@ impl<'input> Loader<'input> {
YamlEvent::Scalar(mut scalar) => {
if let Some(anchor) = scalar.anchor.take() {
let id = anchors.len();
anchors.insert(anchor, id);
document.anchor_names.insert(id, anchor_name(&anchor));
document
.anchor_event_map
.insert(id, document.events.len());
anchors.insert(anchor, id);
}
Event::Scalar(scalar)
}
YamlEvent::SequenceStart(mut sequence_start) => {
if let Some(anchor) = sequence_start.anchor.take() {
let id = anchors.len();
anchors.insert(anchor, id);
document.anchor_names.insert(id, anchor_name(&anchor));
document
.anchor_event_map
.insert(id, document.events.len());
anchors.insert(anchor, id);
}
Event::SequenceStart(sequence_start)
}
YamlEvent::SequenceEnd => Event::SequenceEnd,
YamlEvent::MappingStart(mut mapping_start) => {
if let Some(anchor) = mapping_start.anchor.take() {
let id = anchors.len();
anchors.insert(anchor, id);
document.anchor_names.insert(id, anchor_name(&anchor));
document
.anchor_event_map
.insert(id, document.events.len());
anchors.insert(anchor, id);
}
Event::MappingStart(mapping_start)
}
Expand Down
26 changes: 26 additions & 0 deletions tests/test_de.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ mod tests {
loader::Loader,
modules::error::ErrorImpl,
Deserializer, Number,
DocumentAnchor,
Value::{self, String as SerdeString},
};
use std::{
Expand Down Expand Up @@ -96,6 +97,31 @@ mod tests {
test_de(yaml, &expected);
}

#[test]
/// Test that `anchors()` reports an anchor together with the paths of the
/// aliases that reference it.
fn test_anchor_api() {
// One anchor (`&io`) that is referenced twice through `*io`.
let yaml = indoc! {"
---
a:
enum: &io
INPUT: 0
OUTPUT: 1
b:
enum: *io
c:
enum: *io
"};
let mut deserializer = Deserializer::from_str(yaml);
// Take the first item the deserializer yields and query its anchors.
let document = deserializer.next().unwrap();
// A `None` result becomes an empty vector here, which the length
// assertion below then rejects.
let anchors = document.anchors().unwrap_or_default();
let expected = DocumentAnchor {
anchor_name: "io".into(),
anchor_path: "/a/enum".into(),
aliases: ["/b/enum".into(), "/c/enum".into()].to_vec(),
};
// Exactly one anchor is expected, and it must match field for field.
assert_eq!(anchors.len(), 1);
assert_eq!(anchors[0], expected);
}

#[test]
/// Test borrowed strings with different YAML representations.
fn test_borrowed() {
Expand Down

0 comments on commit 59ee15f

Please sign in to comment.