diff --git a/CHANGELOG.md b/CHANGELOG.md index cdedde2..f7ada78 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - [PR#63](https://github.com/EmbarkStudios/spdx/pull/63) update SPDX license list to 3.22. +### Added +- [PR#64](https://github.com/EmbarkStudios/spdx/pull/64) resolved [#64](https://github.com/EmbarkStudios/spdx/issues/64) by adding `Expression::canonicalize` which fixes otherwise valid expressions into a form parsable with `ParseMode::STRICT` + ## [0.10.2] - 2023-07-14 ### Changed - [PR#61](https://github.com/EmbarkStudios/spdx/pull/61) updated the SPDX license list from `3.20` => `3.21`. diff --git a/src/expression/parser.rs b/src/expression/parser.rs index d12e087..746f97a 100644 --- a/src/expression/parser.rs +++ b/src/expression/parser.rs @@ -26,6 +26,71 @@ impl Expression { Self::parse_mode(original, ParseMode::STRICT) } + /// Canonicalizes the input expression into a form that can be parsed with + /// [`ParseMode::STRICT`] + /// + /// ## Transforms + /// + /// 1. '/' is replaced with ' OR ' + /// 1. Lower-cased operators ('or', 'and', 'with') are upper-cased + /// 1. '+' is transformed to `-or-later` for GNU licenses + /// 1. Invalid/imprecise license identifiers (eg. `apache2`) are replaced + /// with their valid identifiers + /// + /// If the provided expression is not modified then `None` is returned + /// + /// Note that this only does fixup of otherwise valid expressions, passing + /// the resulting string to [`Expression::parse`] can still result in + /// additional parse errors, eg. 
unbalanced parentheses + /// + /// ``` + /// assert_eq!(spdx::Expression::canonicalize("apache with LLVM-exception/gpl-3.0+").unwrap().unwrap(), "Apache-2.0 WITH LLVM-exception OR GPL-3.0-or-later"); + /// ``` + pub fn canonicalize(original: &str) -> Result<Option<String>, ParseError> { + let mut can = String::with_capacity(original.len()); + + let lexer = Lexer::new_mode(original, ParseMode::LAX); + + // Keep track if the last license id is a GNU license that uses the -or-later + // convention rather than the + like all other licenses + let mut last_is_gnu = false; + for tok in lexer { + let tok = tok?; + + match tok.token { + Token::Spdx(id) => { + last_is_gnu = id.is_gnu(); + can.push_str(id.name); + } + Token::And => can.push_str(" AND "), + Token::Or => can.push_str(" OR "), + Token::With => can.push_str(" WITH "), + Token::Plus => { + if last_is_gnu { + can.push_str("-or-later"); + } else { + can.push('+'); + } + } + Token::OpenParen => can.push('('), + Token::CloseParen => can.push(')'), + Token::Exception(exc) => can.push_str(exc.name), + Token::LicenseRef { doc_ref, lic_ref } => { + if let Some(dr) = doc_ref { + can.push_str("DocumentRef-"); + can.push_str(dr); + can.push(':'); + } + + can.push_str("LicenseRef-"); + can.push_str(lic_ref); + } + } + } + + Ok((can != original).then_some(can)) + } + /// Parses an expression with the specified `ParseMode`. With /// `ParseMode::Lax` it permits some non-SPDX syntax, such as imprecise /// license names and "/" used instead of "OR" in exprssions. diff --git a/src/lexer.rs b/src/lexer.rs index afd38c5..41ff1af 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -9,19 +9,23 @@ pub struct ParseMode { /// The `AND`, `OR`, and `WITH` operators are required to be uppercase in /// the SPDX spec, but enabling this option allows them to be lowercased pub allow_lower_case_operators: bool, - /// Allows the use of `/` as a synonym for the `OR` operator. 
This also - /// allows for not having whitespace between the `/` and the terms on either - /// side + /// Allows the use of `/` as a synonym for the `OR` operator. + /// + /// This also allows for not having whitespace between the `/` and the terms + /// on either side pub allow_slash_as_or_operator: bool, /// Allows some invalid/imprecise identifiers as synonyms for an actual - /// license identifier. See [`IMPRECISE_NAMES`](crate::identifiers::IMPRECISE_NAMES) - /// for a list of the current synonyms. Note that this list is not - /// comprehensive but can be expanded upon when invalid identifiers are - /// found in the wild. + /// license identifier. + /// + /// See [`IMPRECISE_NAMES`](crate::identifiers::IMPRECISE_NAMES) for a list + /// of the current synonyms. Note that this list is not comprehensive but + /// can be expanded upon when invalid identifiers are found in the wild. pub allow_imprecise_license_names: bool, /// The various GPL licenses diverge from every other license in the SPDX - /// license list by having an `-or-later` variant that used as a suffix on a - /// base license (eg. `GPL-3.0-or-later`) rather than the canonical `GPL-3.0+`. + /// license list by having an `-or-later` variant that is used as a suffix + /// on a base license (eg. `GPL-3.0-or-later`) rather than the canonical + /// `GPL-3.0+`. + /// /// This option just allows GPL licenses to be treated similarly to all of /// the other SPDX licenses. 
pub allow_postfix_plus_on_gpl: bool, @@ -63,8 +67,7 @@ impl ParseMode { pub enum Token<'a> { /// A recognized SPDX license id Spdx(LicenseId), - /// A `LicenseRef-` prefixed id, with an optional - /// `DocRef-` + /// A `LicenseRef-` prefixed id, with an optional `DocumentRef-` LicenseRef { doc_ref: Option<&'a str>, lic_ref: &'a str, @@ -250,6 +253,8 @@ impl<'a> Iterator for Lexer<'a> { ok_token(Token::And) } else if self.mode.allow_lower_case_operators && m == "or" { ok_token(Token::Or) + } else if self.mode.allow_lower_case_operators && m == "with" { + ok_token(Token::With) } else if let Some(lic_id) = crate::license_id(m) { ok_token(Token::Spdx(lic_id)) } else if let Some(exc_id) = crate::exception_id(m) { diff --git a/src/lib.rs b/src/lib.rs index 79d35a5..d8df52f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -495,6 +495,7 @@ pub fn license_id(name: &str) -> Option<LicenseId> { } /// Find license partially matching the name, e.g. "apache" => "Apache-2.0" +/// /// Returns length (in bytes) of the string matched. Garbage at the end is /// ignored. 
See /// [`identifiers::IMPRECISE_NAMES`](identifiers/constant.IMPRECISE_NAMES.html) @@ -510,11 +511,7 @@ pub fn imprecise_license_id(name: &str) -> Option<(LicenseId, usize)> { for (prefix, correct_name) in identifiers::IMPRECISE_NAMES { if let Some(name_prefix) = name.as_bytes().get(0..prefix.len()) { if prefix.as_bytes().eq_ignore_ascii_case(name_prefix) { - let mut len = prefix.len(); - if name.as_bytes().get(len).copied() == Some(b'+') { - len += 1; - } - return license_id(correct_name).map(|lic| (lic, len)); + return license_id(correct_name).map(|lic| (lic, prefix.len())); } } } diff --git a/tests/validation.rs b/tests/validation.rs index eaf6808..d2f1bf9 100644 --- a/tests/validation.rs +++ b/tests/validation.rs @@ -224,3 +224,34 @@ fn validates_excessive_parens() { ] ]); } + +#[test] +fn canonicalization() { + use spdx::Expression; + + assert!(Expression::canonicalize("Apache-2.0 OR MIT") + .unwrap() + .is_none()); + assert_eq!( + Expression::canonicalize("Apache-2.0/MIT").unwrap().unwrap(), + "Apache-2.0 OR MIT" + ); + assert_eq!( + Expression::canonicalize("MIT and GPL-3.0+") + .unwrap() + .unwrap(), + "MIT AND GPL-3.0-or-later" + ); + assert_eq!( + Expression::canonicalize("simplified bsd license or gpl-2.0+") + .unwrap() + .unwrap(), + "BSD-2-Clause OR GPL-2.0-or-later" + ); + assert_eq!( + Expression::canonicalize("apache with LLVM-exception/mpl") + .unwrap() + .unwrap(), + "Apache-2.0 WITH LLVM-exception OR MPL-2.0" + ); +}