Implement draft functionality and some tests

anatoly-scherbakov · Jan 14, 2021 · 26b3884 · 26b3884
1 parent 16f270c
commit 26b3884
Show file tree

Hide file tree

Showing 17 changed files with 174 additions and 102 deletions.
diff --git a/README.md b/README.md
@@ -4,14 +4,7 @@
 [![Python Version](https://img.shields.io/pypi/pyversions/vendetta.svg)](https://pypi.org/project/vendetta/)
 [![wemake-python-styleguide](https://img.shields.io/badge/style-wemake-000000.svg)](https://github.com/wemake-services/wemake-python-styleguide)
 
-Anonymize CSV file(s)
-
-
-## Features
-
-- Fully typed with annotations and checked with mypy, [PEP561 compatible](https://www.python.org/dev/peps/pep-0561/)
-- Add yours!
-
+Anonymize CSV file(s) by replacing sensitive values with fakes.
 
 ## Installation
 
@@ -22,15 +15,51 @@ pip install vendetta
 
 ## Example
 
-Showcase how your project can be used:
 
-```python
-from vendetta.example import some_function
+Suppose you have an `orders.csv` dataset with real customer names and order IDs.
 
-print(some_function(3, 4))
-# => 7
+```csv
+CustomerName,CustomerLastName,OrderID
+Darth,Wader,1254
+Darth,Wader,1255
+,Yoda,1256
+Luke,Skywalker,1257
+Leia,Skywalker,1258
+,Yoda,1259
 ```
 
+This list contains 4 unique customers. Let's create a configuration file, say, `orders.yaml`:
+
+```yaml
+columns:
+  CustomerName: first_name
+  CustomerLastName: last_name
+```
+
+and run:
+
+```shell
+vendetta orders.yaml orders.csv anon.csv
+```
+
+which gives something like this in `anon.csv`:
+
+```csv
+CustomerName,CustomerLastName,OrderID
+Elizabeth,Oliver,1254
+Elizabeth,Oliver,1255
+Karen,Rodriguez,1256
+Jonathan,Joseph,1257
+Katelyn,Joseph,1258
+Karen,Rodriguez,1259
+```
+
+- The `OrderID` column was not mentioned in the config, so it was left as is.
+- Using [faker](https://faker.readthedocs.io/), the program replaced the first and last names with randomly generated ones, keeping the data believable.
+- If two cells in the same column of the source file had the same value (e.g. `Wader`), the corresponding cells in the output file will also have identical values.
+
+Enjoy!
+
 ## License
 
 [MIT](https://github.com/anatoly-scherbakov/vendetta/blob/master/LICENSE)

diff --git a/tests/__init__.py b/tests/__init__.py
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,9 @@
+from pathlib import Path
+
+import pytest
+
+
+@pytest.fixture(scope='package')
+def test_data() -> Path:
+    """Test data directory."""
+    return Path(__file__).parent / 'data'
diff --git a/tests/data/config.yaml b/tests/data/config.yaml
@@ -0,0 +1,10 @@
+columns:
+  firstname: first_name
+  lastname: last_name
+  birthday: date
+  address: street_address
+  city: city
+  state: state
+  country: country
+  email: email
+  phone: phone_number
diff --git a/tests/data/orders/orders.csv b/tests/data/orders/orders.csv
@@ -0,0 +1,7 @@
+CustomerName,CustomerLastName,OrderID
+Darth,Wader,1254
+Darth,Wader,1255
+,Yoda,1256
+Luke,Skywalker,1257
+Leia,Skywalker,1258
+,Yoda,1259
diff --git a/tests/data/orders/orders.yaml b/tests/data/orders/orders.yaml
@@ -0,0 +1,3 @@
+columns:
+  CustomerName: first_name
+  CustomerLastName: last_name
diff --git a/tests/data/people.csv b/tests/data/people.csv
@@ -0,0 +1,3 @@
+firstname,lastname,birthday,address,city,state,country,email,phone
+Someone,LastName,,,city,state,country,x,y
+SomeoneElse,LastName,,,city,state,country,y,x
diff --git a/tests/parse_csv.py b/tests/parse_csv.py
@@ -0,0 +1,9 @@
+import csv
+from typing import TextIO, List
+
+from vendetta.models import Row
+
+
+def parse_csv(input_data: TextIO) -> List[Row]:
+    """Parse text CSV data for test purposes."""
+    return list(csv.DictReader(input_data))
diff --git a/tests/test_cli/__init__.py b/tests/test_cli/__init__.py
diff --git a/tests/test_cli/test_cli.py b/tests/test_cli/test_cli.py
@@ -0,0 +1,31 @@
+import subprocess
+import tempfile
+from pathlib import Path
+
+from tests.parse_csv import parse_csv
+
+
+def test_cli(test_data: Path):
+    """Run the project's CLI and test it."""
+    with tempfile.TemporaryDirectory() as temp:
+        config = test_data / 'config.yaml'
+        source = test_data / 'people.csv'
+        destination = Path(temp) / 'output.csv'
+
+        subprocess.call([
+            'vendetta',
+            str(config),
+            str(source),
+            str(destination)
+        ])
+
+        with destination.open() as test_result:
+            test_result_data = parse_csv(test_result)
+
+    first, second = test_result_data
+    assert first['lastname'] == second['lastname']
+    assert first['city'] == second['city']
+
+    assert first['email'] != second['email']
+    assert first['phone'] != second['phone']
+    assert first['firstname'] != second['firstname']
diff --git a/tests/test_example/test_some_function.py b/tests/test_example/test_some_function.py
diff --git a/tests/test_vendetta/test_vendetta.py b/tests/test_vendetta/test_vendetta.py
@@ -0,0 +1,37 @@
+from pathlib import Path
+
+from io import StringIO
+
+from tests.parse_csv import parse_csv
+from vendetta import Config, Vendetta
+
+
+def test_vendetta_single(test_data: Path):
+    config = Config(columns={
+        'firstname': 'first_name',
+        'lastname': 'last_name',
+        'birthday': 'date',
+        'address': 'street_address',
+        'city': 'city',
+        'state': 'state',
+        'country': 'country',
+        'email': 'email',
+        'phone': 'phone_number',
+    })
+
+    v = Vendetta(config=config)
+
+    output_file = StringIO()
+    with (test_data / 'people.csv').open('r') as input_file:
+        v(input_file, output_file)
+
+    output_file.seek(0)
+    output_data = parse_csv(output_file)
+
+    first, second = output_data
+    assert first['lastname'] == second['lastname']
+    assert first['city'] == second['city']
+
+    assert first['email'] != second['email']
+    assert first['phone'] != second['phone']
+    assert first['firstname'] != second['firstname']
diff --git a/vendetta/__init__.py b/vendetta/__init__.py
@@ -1 +1,2 @@
-# -*- coding: utf-8 -*-
+from vendetta.models import Config, FakerConfig
+from vendetta.vendetta import Vendetta
diff --git a/vendetta/cli.py b/vendetta/cli.py
@@ -1,63 +1,36 @@
-import csv
 from pathlib import Path
 
+import strictyaml
 import typer
-import yaml
 
 from vendetta.models import Config
 from vendetta.vendetta import Vendetta
 
-try:  # noqa
-    from yaml import CSafeDumper as SafeDumper  # noqa
-    from yaml import CSafeLoader as SafeLoader  # noqa
-except ImportError:
-    from yaml import SafeDumper  # type: ignore   # noqa
-    from yaml import SafeLoader  # type: ignore   # noqa
-
-
 app = typer.Typer()
 
 
-def read_config() -> Config:
+def read_config(path: Path) -> Config:
     """Read configuration file."""
-    with (Path(__file__).parent.parent / 'config.yaml').open() as config_file:
-        return Config(**yaml.load(config_file, Loader=SafeLoader))
+    raw = path.read_text()
+    parsed = strictyaml.load(raw).data
+    return Config(**parsed)
 
 
 @app.command()
-def cli() -> None:
-    """CLI."""
-    config = read_config()
-    source_directory = Path(
-        '...',
-    )
-    destination_directory = Path(
-        '...',
-    )
-
+def cli(
+    config_file: Path,
+    source: Path,
+    destination: Path,
+) -> None:
+    """Vendetta: anonymize CSV datasets."""
+    config = read_config(config_file)
     vendetta = Vendetta(config=config)
 
-    for source_path in source_directory.rglob('*'):
-        destination_path = destination_directory / source_path.relative_to(
-            source_directory,
+    with source.open('r') as input_file, destination.open('w') as output_file:
+        vendetta(
+            input_file=input_file,
+            output_file=output_file,
         )
-        with source_path.open() as source_file:
-            reader = csv.DictReader(source_file)
-
-            if not destination_path.parent.exists():
-                destination_path.parent.mkdir(parents=True, exist_ok=True)
-
-            with destination_path.open('w+') as destination_file:
-                writer = csv.DictWriter(
-                    destination_file,
-                    fieldnames=reader.fieldnames,
-                )
-                writer.writeheader()
-
-                for row in reader:
-                    writer.writerow(
-                        vendetta.anonymize_row(row),
-                    )
 
 
 def main() -> None:

diff --git a/vendetta/example.py b/vendetta/example.py
diff --git a/vendetta/models.py b/vendetta/models.py
@@ -10,9 +10,6 @@
 # Generate a random value cached based on original value
 ResponsibleFake = Callable[[str], str]
 
-# Function which changes the row of the source data.
-RowUpdater = Callable[[Row], None]
-
 
 class FakerConfig(BaseModel):
     """Faker configuration."""

diff --git a/vendetta/vendetta.py b/vendetta/vendetta.py
@@ -1,16 +1,10 @@
 import csv
 from functools import lru_cache
-from typing import Callable, Dict, TextIO
+from typing import Dict, TextIO
 
 from faker import Faker
 
-from vendetta.models import Config, Row, ResponsibleFake, NaiveFake
-
-faker = Faker()
-
-
-def cached_faker(fake: Callable[[], str]) -> Callable[[str], str]:
-    return lru_cache()(lambda _: fake())
+from vendetta.models import Config, ResponsibleFake, NaiveFake
 
 
 class Vendetta:
@@ -52,7 +46,7 @@ def anonymize_file(self, input_file: TextIO, output_file: TextIO) -> None:
 
         fake_per_column = {
             column_name: self.get_fake_by_name(fake_name)
-            for column_name, fake_name in self.config.columns
+            for column_name, fake_name in self.config.columns.items()
             if column_name in set(columns)
         }
 
@@ -61,3 +55,5 @@ def anonymize_file(self, input_file: TextIO, output_file: TextIO) -> None:
                 row[column_name] = fake(row[column_name])
 
             writer.writerow(row)
+
+    __call__ = anonymize_file