From 26b38842e91a8c177ef97d921b17a8c147853510 Mon Sep 17 00:00:00 2001 From: Anatoly Scherbakov Date: Thu, 14 Jan 2021 21:42:11 +0700 Subject: [PATCH] Implement draft functionality and some tests --- README.md | 55 ++++++++++++++++------ tests/__init__.py | 0 tests/conftest.py | 9 ++++ tests/data/config.yaml | 10 ++++ tests/data/orders/orders.csv | 7 +++ tests/data/orders/orders.yaml | 3 ++ tests/data/people.csv | 3 ++ tests/parse_csv.py | 9 ++++ tests/test_cli/__init__.py | 0 tests/test_cli/test_cli.py | 31 +++++++++++++ tests/test_example/test_some_function.py | 16 ------- tests/test_vendetta/test_vendetta.py | 37 +++++++++++++++ vendetta/__init__.py | 3 +- vendetta/cli.py | 59 +++++++----------------- vendetta/example.py | 17 ------- vendetta/models.py | 3 -- vendetta/vendetta.py | 14 ++---- 17 files changed, 174 insertions(+), 102 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/conftest.py create mode 100644 tests/data/config.yaml create mode 100644 tests/data/orders/orders.csv create mode 100644 tests/data/orders/orders.yaml create mode 100644 tests/data/people.csv create mode 100644 tests/parse_csv.py create mode 100644 tests/test_cli/__init__.py create mode 100644 tests/test_cli/test_cli.py delete mode 100644 tests/test_example/test_some_function.py create mode 100644 tests/test_vendetta/test_vendetta.py delete mode 100644 vendetta/example.py diff --git a/README.md b/README.md index d8d9259..ba25730 100644 --- a/README.md +++ b/README.md @@ -4,14 +4,7 @@ [![Python Version](https://img.shields.io/pypi/pyversions/vendetta.svg)](https://pypi.org/project/vendetta/) [![wemake-python-styleguide](https://img.shields.io/badge/style-wemake-000000.svg)](https://github.com/wemake-services/wemake-python-styleguide) -Anonymize CSV file(s) - - -## Features - -- Fully typed with annotations and checked with mypy, [PEP561 compatible](https://www.python.org/dev/peps/pep-0561/) -- Add yours! 
- +Anonymize CSV file(s) by replacing sensitive values with fakes. ## Installation @@ -22,15 +15,51 @@ pip install vendetta ## Example -Showcase how your project can be used: -```python -from vendetta.example import some_function +Suppose you have an `orders.csv` dataset with real customer names and order IDs. -print(some_function(3, 4)) -# => 7 +```csv +CustomerName,CustomerLastName,OrderID +Darth,Wader,1254 +Darth,Wader,1255 +,Yoda,1256 +Luke,Skywalker,1257 +Leia,Skywalker,1258 +,Yoda,1259 ``` +This list contains 4 unique customers. Let's create a configuration file, say, `orders.yaml`: + +```yaml +columns: + CustomerName: first_name + CustomerLastName: last_name +``` + +and run: + +```shell +vendetta orders.yaml orders.csv anon.csv +``` + +which gives something like this in `anon.csv`: + +```csv +CustomerName,CustomerLastName,OrderID +Elizabeth,Oliver,1254 +Elizabeth,Oliver,1255 +Karen,Rodriguez,1256 +Jonathan,Joseph,1257 +Katelyn,Joseph,1258 +Karen,Rodriguez,1259 +``` + +- The `OrderID` column was not mentioned in the config and was left as is. +- Using [faker](https://faker.readthedocs.io/), the program replaced the first and last names with random first and last names, making the data believable. +- If two cells in the same column of the source file had the same value (e.g. `Wader`), the output file will also have identical values in these cells. + +Enjoy! 
+ ## License [MIT](https://github.com/anatoly-scherbakov/vendetta/blob/master/LICENSE) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..9dc3812 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,9 @@ +from pathlib import Path + +import pytest + + +@pytest.fixture(scope='package') +def test_data() -> Path: + """Test data directory.""" + return Path(__file__).parent / 'data' diff --git a/tests/data/config.yaml b/tests/data/config.yaml new file mode 100644 index 0000000..910290d --- /dev/null +++ b/tests/data/config.yaml @@ -0,0 +1,10 @@ +columns: + firstname: first_name + lastname: last_name + birthday: date + address: street_address + city: city + state: state + country: country + email: email + phone: phone_number diff --git a/tests/data/orders/orders.csv b/tests/data/orders/orders.csv new file mode 100644 index 0000000..84e6c75 --- /dev/null +++ b/tests/data/orders/orders.csv @@ -0,0 +1,7 @@ +CustomerName,CustomerLastName,OrderID +Darth,Wader,1254 +Darth,Wader,1255 +,Yoda,1256 +Luke,Skywalker,1257 +Leia,Skywalker,1258 +,Yoda,1259 diff --git a/tests/data/orders/orders.yaml b/tests/data/orders/orders.yaml new file mode 100644 index 0000000..f67ab3f --- /dev/null +++ b/tests/data/orders/orders.yaml @@ -0,0 +1,3 @@ +columns: + CustomerName: first_name + CustomerLastName: last_name diff --git a/tests/data/people.csv b/tests/data/people.csv new file mode 100644 index 0000000..2ae0c9c --- /dev/null +++ b/tests/data/people.csv @@ -0,0 +1,3 @@ +firstname,lastname,birthday,address,city,state,country,email,phone +Someone,LastName,,,city,state,country,x,y +SomeoneElse,LastName,,,city,state,country,y,x diff --git a/tests/parse_csv.py b/tests/parse_csv.py new file mode 100644 index 0000000..98faac1 --- /dev/null +++ b/tests/parse_csv.py @@ -0,0 +1,9 @@ +import csv +from typing import TextIO, List + +from vendetta.models import Row + + +def 
parse_csv(input_data: TextIO) -> List[Row]: + """Parse text CSV data for test purposes.""" + return list(csv.DictReader(input_data)) diff --git a/tests/test_cli/__init__.py b/tests/test_cli/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_cli/test_cli.py b/tests/test_cli/test_cli.py new file mode 100644 index 0000000..eabf1f1 --- /dev/null +++ b/tests/test_cli/test_cli.py @@ -0,0 +1,31 @@ +import subprocess +import tempfile +from pathlib import Path + +from tests.parse_csv import parse_csv + + +def test_cli(test_data: Path): + """Run the project's CLI and test it.""" + with tempfile.TemporaryDirectory() as temp: + config = test_data / 'config.yaml' + source = test_data / 'people.csv' + destination = Path(temp) / 'output.csv' + + subprocess.call([ + 'vendetta', + str(config), + str(source), + str(destination) + ]) + + with destination.open() as test_result: + test_result_data = parse_csv(test_result) + + first, second = test_result_data + assert first['lastname'] == second['lastname'] + assert first['city'] == second['city'] + + assert first['email'] != second['email'] + assert first['phone'] != second['phone'] + assert first['firstname'] != second['firstname'] diff --git a/tests/test_example/test_some_function.py b/tests/test_example/test_some_function.py deleted file mode 100644 index b2f1c46..0000000 --- a/tests/test_example/test_some_function.py +++ /dev/null @@ -1,16 +0,0 @@ -# -*- coding: utf-8 -*- - -import pytest - -from vendetta.example import some_function - - -@pytest.mark.parametrize(('first', 'second', 'expected'), [ - (1, 2, 3), - (2, 4, 6), - (-2, -3, -5), - (-5, 5, 0), -]) -def test_some_function(first, second, expected): - """Example test with parametrization.""" - assert some_function(first, second) == expected diff --git a/tests/test_vendetta/test_vendetta.py b/tests/test_vendetta/test_vendetta.py new file mode 100644 index 0000000..8a71e2d --- /dev/null +++ b/tests/test_vendetta/test_vendetta.py @@ -0,0 +1,37 @@ 
+from pathlib import Path + +from io import StringIO + +from tests.parse_csv import parse_csv +from vendetta import Config, Vendetta + + +def test_vendetta_single(test_data: Path): + config = Config(columns={ + 'firstname': 'first_name', + 'lastname': 'last_name', + 'birthday': 'date', + 'address': 'street_address', + 'city': 'city', + 'state': 'state', + 'country': 'country', + 'email': 'email', + 'phone': 'phone_number', + }) + + v = Vendetta(config=config) + + output_file = StringIO() + with (test_data / 'people.csv').open('r') as input_file: + v(input_file, output_file) + + output_file.seek(0) + output_data = parse_csv(output_file) + + first, second = output_data + assert first['lastname'] == second['lastname'] + assert first['city'] == second['city'] + + assert first['email'] != second['email'] + assert first['phone'] != second['phone'] + assert first['firstname'] != second['firstname'] diff --git a/vendetta/__init__.py b/vendetta/__init__.py index 40a96af..06611ed 100644 --- a/vendetta/__init__.py +++ b/vendetta/__init__.py @@ -1 +1,2 @@ -# -*- coding: utf-8 -*- +from vendetta.models import Config, FakerConfig +from vendetta.vendetta import Vendetta diff --git a/vendetta/cli.py b/vendetta/cli.py index 90b8c2f..30074aa 100644 --- a/vendetta/cli.py +++ b/vendetta/cli.py @@ -1,63 +1,36 @@ -import csv from pathlib import Path +import strictyaml import typer -import yaml from vendetta.models import Config from vendetta.vendetta import Vendetta -try: # noqa - from yaml import CSafeDumper as SafeDumper # noqa - from yaml import CSafeLoader as SafeLoader # noqa -except ImportError: - from yaml import SafeDumper # type: ignore # noqa - from yaml import SafeLoader # type: ignore # noqa - - app = typer.Typer() -def read_config() -> Config: +def read_config(path: Path) -> Config: """Read configuration file.""" - with (Path(__file__).parent.parent / 'config.yaml').open() as config_file: - return Config(**yaml.load(config_file, Loader=SafeLoader)) + raw = path.read_text() 
+ parsed = strictyaml.load(raw).data + return Config(**parsed) @app.command() -def cli() -> None: - """CLI.""" - config = read_config() - source_directory = Path( - '...', - ) - destination_directory = Path( - '...', - ) - +def cli( + config_file: Path, + source: Path, + destination: Path, +) -> None: + """Vendetta: anonymize CSV datasets.""" + config = read_config(config_file) vendetta = Vendetta(config=config) - for source_path in source_directory.rglob('*'): - destination_path = destination_directory / source_path.relative_to( - source_directory, + with source.open('r') as input_file, destination.open('w') as output_file: + vendetta( + input_file=input_file, + output_file=output_file, ) - with source_path.open() as source_file: - reader = csv.DictReader(source_file) - - if not destination_path.parent.exists(): - destination_path.parent.mkdir(parents=True, exist_ok=True) - - with destination_path.open('w+') as destination_file: - writer = csv.DictWriter( - destination_file, - fieldnames=reader.fieldnames, - ) - writer.writeheader() - - for row in reader: - writer.writerow( - vendetta.anonymize_row(row), - ) def main() -> None: diff --git a/vendetta/example.py b/vendetta/example.py deleted file mode 100644 index f8dc708..0000000 --- a/vendetta/example.py +++ /dev/null @@ -1,17 +0,0 @@ -# -*- coding: utf-8 -*- - - -def some_function(first: int, second: int) -> int: - """ - We use this function as an example for some real logic. - - This is how you can write a doctest: - - .. code:: python - - >>> some_function(2, 3) - 5 - - Enjoy! - """ - return first + second diff --git a/vendetta/models.py b/vendetta/models.py index 143f0f1..f05ca63 100644 --- a/vendetta/models.py +++ b/vendetta/models.py @@ -10,9 +10,6 @@ # Generate a random value cached based on original value ResponsibleFake = Callable[[str], str] -# Function which changes the row of the source data. 
-RowUpdater = Callable[[Row], None] - class FakerConfig(BaseModel): """Faker configuration.""" diff --git a/vendetta/vendetta.py b/vendetta/vendetta.py index bb5d029..4ce37e4 100644 --- a/vendetta/vendetta.py +++ b/vendetta/vendetta.py @@ -1,16 +1,10 @@ import csv from functools import lru_cache -from typing import Callable, Dict, TextIO +from typing import Dict, TextIO from faker import Faker -from vendetta.models import Config, Row, ResponsibleFake, NaiveFake - -faker = Faker() - - -def cached_faker(fake: Callable[[], str]) -> Callable[[str], str]: - return lru_cache()(lambda _: fake()) +from vendetta.models import Config, ResponsibleFake, NaiveFake class Vendetta: @@ -52,7 +46,7 @@ def anonymize_file(self, input_file: TextIO, output_file: TextIO) -> None: fake_per_column = { column_name: self.get_fake_by_name(fake_name) - for column_name, fake_name in self.config.columns + for column_name, fake_name in self.config.columns.items() if column_name in set(columns) } @@ -61,3 +55,5 @@ def anonymize_file(self, input_file: TextIO, output_file: TextIO) -> None: row[column_name] = fake(row[column_name]) writer.writerow(row) + + __call__ = anonymize_file