// NPM
var (
	// npmAPIRegex matches npm/Yarn registry requests of the form
	// /<package>/<version>, where <package> may carry an "@scope/" prefix.
	npmAPIRegex = regexp.MustCompile(`^https://registry\.(npmjs\.org|yarnpkg\.com)/((@[^/]+/)?[^/]+)/([^/]+)/?$`)
	// npmFileRegex matches npm/Yarn tarball downloads of the form
	// /<package>/-/<file>.tgz. Group 2 is the (optionally scoped) package
	// path; groups 4 and 5 are the file stem split at its last hyphen.
	npmFileRegex = regexp.MustCompile(`^https://registry\.(npmjs\.org|yarnpkg\.com)/((@[^/]+/)?[^/]+)/-/([^/]+)-([^/-]+)\.tgz$`)
)

// Maven
var (
	// mavenRegex matches artifact downloads laid out as
	// <repo>/<group dirs>/<artifact>/<version>/<file>. Group 2 is the group
	// directory path, group 3 the artifact and group 4 the version.
	// The dots in both host names are escaped so look-alike hosts do not match.
	mavenRegex = regexp.MustCompile(`^https?://(repo1\.maven\.org/maven2|plugins\.gradle\.org/m2)/(.+)/([^/]+)/([^/]+)/([^/]+)$`)
)

// Crates (Rust)
var (
	// cratesAPIRegex matches /api/v1/crates/<name>/<version> with an optional
	// trailing word segment. That segment also admits "download", so callers
	// that care about the distinction must test cratesFileRegex first.
	cratesAPIRegex = regexp.MustCompile(`^https?://crates\.io/api/v1/crates/([^/]+)/([^/]+)(?:/\w+)?$`)
	// cratesFileRegex matches /api/v1/crates/<name>/<version>/download.
	// Group 1 is the crate name and group 2 the version.
	cratesFileRegex = regexp.MustCompile(`^https?://crates\.io/api/v1/crates/([^/]+)/([^/]+)/download$`)
)

// classifyNPMURL converts an npm or Yarn registry tarball URL into a package
// URL of the form "pkg:npm/<package>@<version>".
//
// It returns an error when rawURL does not match npmFileRegex.
func classifyNPMURL(rawURL string) (string, error) {
	matches := npmFileRegex.FindStringSubmatch(rawURL)
	if matches == nil {
		return "", errors.New("invalid NPM download URL format")
	}
	packagePath := matches[2]
	version := matches[5]
	// The regex splits the file stem at its LAST hyphen, which cuts prerelease
	// versions such as "18.0.0-rc.0" down to "rc.0". When the stem starts with
	// the unscoped package name, everything after "<name>-" is the version.
	// Otherwise keep the regex's split so unusual file names still classify.
	stem := matches[4] + "-" + matches[5]
	baseName := packagePath[strings.LastIndex(packagePath, "/")+1:]
	if prefix := baseName + "-"; strings.HasPrefix(stem, prefix) {
		version = stem[len(prefix):]
	}
	return fmt.Sprintf("pkg:npm/%s@%s", packagePath, version), nil
}

// classifyCratesURL converts a crates.io download URL into a package URL of
// the form "pkg:cargo/<name>@<version>".
//
// It returns an error when rawURL does not match cratesFileRegex.
func classifyCratesURL(rawURL string) (string, error) {
	matches := cratesFileRegex.FindStringSubmatch(rawURL)
	if matches == nil {
		return "", errors.New("invalid Cargo URL format")
	}
	name, version := matches[1], matches[2]
	return fmt.Sprintf("pkg:cargo/%s@%s", name, version), nil
}

// classifyMavenURL converts a Maven Central or Gradle plugin portal artifact
// URL into a package URL of the form "pkg:maven/<group>/<artifact>@<version>".
//
// It returns an error when rawURL does not match mavenRegex.
func classifyMavenURL(rawURL string) (string, error) {
	matches := mavenRegex.FindStringSubmatch(rawURL)
	if matches == nil {
		return "", errors.New("invalid Maven URL format")
	}
	// The group's directory path maps onto the dotted groupId. A single
	// directory (e.g. "junit") is a valid groupId, so no minimum depth is
	// required beyond what the regex already enforces.
	namespace := strings.ReplaceAll(matches[2], "/", ".")
	name, version := matches[3], matches[4]
	return fmt.Sprintf("pkg:maven/%s/%s@%s", namespace, name, version), nil
}
name: "npm_download_scoped", + url: "https://registry.npmjs.org/@invisionag/eslint-config-ivx/-/eslint-config-ivx-0.0.2.tgz", + want: "pkg:npm/@invisionag/eslint-config-ivx@0.0.2", + }, + { + name: "npm_yarn_download_simple", + url: "https://registry.yarnpkg.com/express/-/express-4.17.1.tgz", + want: "pkg:npm/express@4.17.1", + }, + { + name: "npm_api_scoped", + url: "https://registry.npmjs.org/@esbuild/freebsd-arm64/0.21.5", + wantErr: ErrSkipped, + }, + { + name: "npm_api_simple", + url: "https://registry.npmjs.org/express/4.17.1", + wantErr: ErrSkipped, + }, + { + name: "npm_yarn_api_simple", + url: "https://registry.yarnpkg.com/express/4.17.1", + wantErr: ErrSkipped, + }, + + // Maven test cases + { + name: "maven_central_artifact", + url: "https://repo1.maven.org/maven2/org/apache/commons/commons-lang3/3.12.0/commons-lang3-3.12.0.jar", + want: "pkg:maven/org.apache.commons/commons-lang3@3.12.0", + }, + { + name: "maven_with_classifier", + url: "https://repo1.maven.org/maven2/org/apache/spark/spark-core_2.12/3.1.2/spark-core_2.12-3.1.2-tests.jar", + want: "pkg:maven/org.apache.spark/spark-core_2.12@3.1.2", + }, + { + name: "maven_gradle_plugin_repo_artifact", + url: "https://plugins.gradle.org/m2/com/google/protobuf/com.google.protobuf.gradle.plugin/0.9.4/com.google.protobuf.gradle.plugin-0.9.4.pom", + want: "pkg:maven/com.google.protobuf/com.google.protobuf.gradle.plugin@0.9.4", + }, + + // Crates (Rust) test cases + { + name: "crates_download", + url: "https://crates.io/api/v1/crates/rand/0.7.2/download", + want: "pkg:cargo/rand@0.7.2", + }, + { + name: "crates_api_package", + url: "https://crates.io/api/v1/crates/rand", + wantErr: ErrUnclassified, + }, + { + name: "crates_api", + url: "https://crates.io/api/v1/crates/rand/0.7.2", + wantErr: ErrSkipped, + }, + { + name: "crates_api_deps", + url: "https://crates.io/api/v1/crates/rand/0.7.2/dependencies", + wantErr: ErrSkipped, + }, + // gcs URL tests { name: "valid GCS URL",