From 58e27848b6523f2bb633e7c0144f9550aa36415d Mon Sep 17 00:00:00 2001 From: Sushant Date: Sat, 16 Jul 2022 13:27:57 +0530 Subject: [PATCH] feat(model): Add linearsvc agent --- README.md | 3 ++ atarashi/agents/linearsvc.py | 90 +++++++++++++++++++++++++++++++++ atarashi/atarashii.py | 5 +- atarashi/evaluator/evaluator.py | 2 +- pyproject.toml | 5 +- requirements.txt | 3 +- setup.py | 3 +- 7 files changed, 105 insertions(+), 6 deletions(-) create mode 100644 atarashi/agents/linearsvc.py diff --git a/README.md b/README.md index 8ed5f85..191e1c3 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,9 @@ Get the help by running `atarashi -h` or `atarashi --help` - Running **wordFrequencySimilarity** agent `atarashi -a wordFrequencySimilarity /path/to/file.c` +- Running **linearsvc** agent + + `atarashi -a linearsvc /path/to/file.c` - Running **tfidf** agent - With **Cosine similarity** diff --git a/atarashi/agents/linearsvc.py b/atarashi/agents/linearsvc.py new file mode 100644 index 0000000..0bf67f3 --- /dev/null +++ b/atarashi/agents/linearsvc.py @@ -0,0 +1,90 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +""" +Copyright 2022 Sushant Kumar (sushantmishra02102002@gmail.com) +SPDX-License-Identifier: GPL-2.0 +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +version 2 as published by the Free Software Foundation. +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+""" + +__author__ = 'Sushant Kumar' +__email__ = 'sushantmishra02102002@gmail.com' + +import argparse + +from atarashi.agents.atarashiAgent import AtarashiAgent +from atarashi.libs.initialmatch import spdx_identifer +from linearsvc import linearsvc + + +class Linearsvc(AtarashiAgent): + + def __init__(self, licenseList): + super().__init__(licenseList) + + def predict_shortname(self, processed_comment): + ''' + :param processed_comment: extracted and preprocessed comment + :return: Result of predict() on the one-element list holding the comment; scan() reads element 0 as the short name + ''' + + processed_comment = [processed_comment] + classifier = linearsvc(processed_comment) + predictor = classifier.classify() + return predictor.predict(processed_comment) + + def scan(self, filePath): + ''' + Check the raw content of filePath for SPDX identifiers first. + If none are found, pass the result of loadFile() to predict_shortname(). + :param filePath: Path of the file to scan + :return: List of matches; a predicted match is a dict with shortname, sim_score (always 1.0), sim_type and description + ''' + + match = [] + + with open(filePath) as file: + raw_data = file.read() + + spdx_identifers = spdx_identifer(raw_data, + self.licenseList['shortname']) + if spdx_identifers: + match.extend(spdx_identifers) + else: + processed_comment = super().loadFile(filePath) + license_name = self.predict_shortname(processed_comment) + + match.append({ + 'shortname': str(license_name[0]), + 'sim_score': 1.0, + 'sim_type': 'linearsvc', + 'description': '', + }) + return match + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument('processedLicenseList', + help='Specify the processed license list file') + parser.add_argument('inputFile', + help='Specify the input file which needs to be scanned' + ) + + args = parser.parse_args() + + licenseList = args.processedLicenseList + filename = args.inputFile + + scanner = Linearsvc(licenseList) + scanner.scan(filename) diff --git a/atarashi/atarashii.py b/atarashi/atarashii.py index e551ab3..e929adb 100644 --- a/atarashi/atarashii.py +++ 
b/atarashi/atarashii.py @@ -28,6 +28,7 @@ from atarashi.agents.dameruLevenDist import DameruLevenDist from atarashi.agents.tfidf import TFIDF from atarashi.agents.wordFrequencySimilarity import WordFrequencySimilarity +from atarashi.agents.linearsvc import Linearsvc __author__ = "Aman Jain" __email__ = "amanjain5221@gmail.com" @@ -78,6 +79,8 @@ def build_scanner_obj(processedLicense, agent_name, similarity="CosineSim", scanner = WordFrequencySimilarity(processedLicense) elif agent_name == "DLD": scanner = DameruLevenDist(processedLicense) + elif agent_name == "linearsvc": + scanner = Linearsvc(processedLicense) elif agent_name == "tfidf": scanner = TFIDF(processedLicense) if similarity == "CosineSim": @@ -128,7 +131,7 @@ def main(): parser.add_argument("-l", "--processedLicenseList", required=False, help="Specify the location of processed license list file") parser.add_argument("-a", "--agent_name", required=True, - choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'], + choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram', 'linearsvc'], help="Name of the agent that needs to be run") parser.add_argument("-s", "--similarity", required=False, default="CosineSim", choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"], diff --git a/atarashi/evaluator/evaluator.py b/atarashi/evaluator/evaluator.py index 64ecce6..7a9eb9a 100755 --- a/atarashi/evaluator/evaluator.py +++ b/atarashi/evaluator/evaluator.py @@ -118,7 +118,7 @@ def evaluate(scanner): defaultJSON = resource_filename("atarashi", "data/Ngram_keywords.json") parser = argparse.ArgumentParser() parser.add_argument("-a", "--agent_name", required=True, - choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'], + choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram', 'linearsvc'], help="Name of the agent that needs to be run") parser.add_argument("-s", "--similarity", required=False, default="CosineSim", choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"], diff --git 
a/pyproject.toml b/pyproject.toml index 5160d31..370df09 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,10 +5,11 @@ requires = [ "numpy>=1.16.0", "tqdm>=4.42.0", "pandas>=0.23.1", - "scikit-learn>=0.18.1", + "scikit-learn==1.1.1", "scipy>=0.18.1", "textdistance>=3.0.3", "pyxDamerauLevenshtein>=1.5", "nirjas>=0.0.5", - "urllib3>=1.24.1" + "urllib3>=1.24.1", + "linearsvc>=0.1.1" ] diff --git a/requirements.txt b/requirements.txt index d77b15d..1975905 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,11 @@ tqdm>=4.42.0 pandas>=0.23.1 pyxDamerauLevenshtein>=1.5 -scikit-learn>=0.18.1 +scikit-learn==1.1.1 scipy>=0.18.1 spacy>=2.0.11 textdistance>=3.0.3 setuptools>=39.2.0 nirjas>=0.0.5 urllib3>=1.24.1 +linearsvc>=0.1.1 diff --git a/setup.py b/setup.py index d468589..d578658 100755 --- a/setup.py +++ b/setup.py @@ -68,7 +68,8 @@ def read(fname): 'textdistance>=3.0.3', 'pyxDamerauLevenshtein>=1.5', 'urllib3>=1.24.1', - 'nirjas>=0.0.5' + 'nirjas>=0.0.5', + 'linearsvc>=0.1.1' ] class BuildAtarashiDependencies(distutils.cmd.Command):