Skip to content

Commit

Permalink
Implement draft functionality and some tests
Browse files Browse the repository at this point in the history
  • Loading branch information
anatoly-scherbakov committed Jan 14, 2021
1 parent 16f270c commit 26b3884
Show file tree
Hide file tree
Showing 17 changed files with 174 additions and 102 deletions.
55 changes: 42 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,7 @@
[![Python Version](https://img.shields.io/pypi/pyversions/vendetta.svg)](https://pypi.org/project/vendetta/)
[![wemake-python-styleguide](https://img.shields.io/badge/style-wemake-000000.svg)](https://github.com/wemake-services/wemake-python-styleguide)

Anonymize CSV file(s)


## Features

- Fully typed with annotations and checked with mypy, [PEP561 compatible](https://www.python.org/dev/peps/pep-0561/)
- Add yours!

Anonymize CSV file(s) by replacing sensitive values with fakes.

## Installation

Expand All @@ -22,15 +15,51 @@ pip install vendetta

## Example

Showcase how your project can be used:

```python
from vendetta.example import some_function
Suppose you have an `orders.csv` dataset with real customer names and order IDs.

print(some_function(3, 4))
# => 7
```csv
CustomerName,CustomerLastName,OrderID
Darth,Wader,1254
Darth,Wader,1255
,Yoda,1256
Luke,Skywalker,1257
Leia,Skywalker,1258
,Yoda,1259
```

This list contains 4 unique customers. Let's create a configuration file, say, `orders.yaml`:

```yaml
columns:
  CustomerName: first_name
  CustomerLastName: last_name
```
and run:
```shell
vendetta orders.yaml orders.csv anon.csv
```

which gives something like this in `anon.csv`:

```csv
CustomerName,CustomerLastName,OrderID
Elizabeth,Oliver,1254
Elizabeth,Oliver,1255
Karen,Rodriguez,1256
Jonathan,Joseph,1257
Katelyn,Joseph,1258
Karen,Rodriguez,1259
```

- The `OrderID` column was not mentioned in the config, so it was left as is.
- Using [faker](https://faker.readthedocs.io/), the program replaced the first and last names with random first and last names, keeping the data believable.
- If two cells in the same column of the source file had the same value (e.g. `Wader`), the output file will also have identical values in those cells.

Enjoy!

## License

[MIT](https://github.com/anatoly-scherbakov/vendetta/blob/master/LICENSE)
Expand Down
Empty file added tests/__init__.py
Empty file.
9 changes: 9 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from pathlib import Path

import pytest


@pytest.fixture(scope='package')
def test_data() -> Path:
    """Locate the ``data`` directory that sits next to this module."""
    tests_root = Path(__file__).parent
    return tests_root / 'data'
10 changes: 10 additions & 0 deletions tests/data/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
columns:
firstname: first_name
lastname: last_name
birthday: date
address: street_address
city: city
state: state
country: country
email: email
phone: phone_number
7 changes: 7 additions & 0 deletions tests/data/orders/orders.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
CustomerName,CustomerLastName,OrderID
Darth,Wader,1254
Darth,Wader,1255
,Yoda,1256
Luke,Skywalker,1257
Leia,Skywalker,1258
,Yoda,1259
3 changes: 3 additions & 0 deletions tests/data/orders/orders.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
columns:
CustomerName: first_name
CustomerLastName: last_name
3 changes: 3 additions & 0 deletions tests/data/people.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
firstname,lastname,birthday,address,city,state,country,email,phone
Someone,LastName,,,city,state,country,x,y
SomeoneElse,LastName,,,city,state,country,y,x
9 changes: 9 additions & 0 deletions tests/parse_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import csv
from typing import TextIO, List

from vendetta.models import Row


def parse_csv(input_data: TextIO) -> List[Row]:
    """Read CSV text into a list of rows keyed by the header line.

    Test helper: the whole input is consumed eagerly.
    """
    reader = csv.DictReader(input_data)
    return list(reader)
Empty file added tests/test_cli/__init__.py
Empty file.
31 changes: 31 additions & 0 deletions tests/test_cli/test_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import subprocess
import tempfile
from pathlib import Path

from tests.parse_csv import parse_csv


def test_cli(test_data: Path):
    """Run the installed ``vendetta`` command end to end and check its output.

    The command is asked to anonymize ``people.csv`` using ``config.yaml``
    and to write the result into a temporary directory; the result is then
    parsed back and the two data rows are compared with each other.
    """
    with tempfile.TemporaryDirectory() as temp:
        config = test_data / 'config.yaml'
        source = test_data / 'people.csv'
        destination = Path(temp) / 'output.csv'

        # check=True turns a non-zero exit status into CalledProcessError, so
        # a crashing CLI fails the test here instead of surfacing later as a
        # missing output file or an unpacking error.
        subprocess.run(
            [
                'vendetta',
                str(config),
                str(source),
                str(destination),
            ],
            check=True,
        )

        with destination.open() as test_result:
            test_result_data = parse_csv(test_result)

    first, second = test_result_data

    # Both source rows share these values, so their fakes must match too.
    assert first['lastname'] == second['lastname']
    assert first['city'] == second['city']

    # These values differ between the source rows, so the fakes should differ.
    assert first['email'] != second['email']
    assert first['phone'] != second['phone']
    assert first['firstname'] != second['firstname']
assert first['firstname'] != second['firstname']
16 changes: 0 additions & 16 deletions tests/test_example/test_some_function.py

This file was deleted.

37 changes: 37 additions & 0 deletions tests/test_vendetta/test_vendetta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from pathlib import Path

from io import StringIO

from tests.parse_csv import parse_csv
from vendetta import Config, Vendetta


def test_vendetta_single(test_data: Path):
    """Anonymize ``people.csv`` in memory and compare the two output rows."""
    column_fakes = {
        'firstname': 'first_name',
        'lastname': 'last_name',
        'birthday': 'date',
        'address': 'street_address',
        'city': 'city',
        'state': 'state',
        'country': 'country',
        'email': 'email',
        'phone': 'phone_number',
    }
    anonymizer = Vendetta(config=Config(columns=column_fakes))

    output_file = StringIO()
    source_path = test_data / 'people.csv'
    with source_path.open('r') as input_file:
        anonymizer(input_file, output_file)

    # Rewind the buffer so the anonymized CSV can be read back from the start.
    output_file.seek(0)
    first, second = parse_csv(output_file)

    # Values shared by both source rows are expected to stay shared.
    assert first['lastname'] == second['lastname']
    assert first['city'] == second['city']

    # Values that differ in the source are expected to differ in the output.
    assert first['email'] != second['email']
    assert first['phone'] != second['phone']
    assert first['firstname'] != second['firstname']
3 changes: 2 additions & 1 deletion vendetta/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
# -*- coding: utf-8 -*-
from vendetta.models import Config, FakerConfig
from vendetta.vendetta import Vendetta
59 changes: 16 additions & 43 deletions vendetta/cli.py
Original file line number Diff line number Diff line change
@@ -1,63 +1,36 @@
import csv
from pathlib import Path

import strictyaml
import typer
import yaml

from vendetta.models import Config
from vendetta.vendetta import Vendetta

try: # noqa
from yaml import CSafeDumper as SafeDumper # noqa
from yaml import CSafeLoader as SafeLoader # noqa
except ImportError:
from yaml import SafeDumper # type: ignore # noqa
from yaml import SafeLoader # type: ignore # noqa


app = typer.Typer()


def read_config() -> Config:
def read_config(path: Path) -> Config:
"""Read configuration file."""
with (Path(__file__).parent.parent / 'config.yaml').open() as config_file:
return Config(**yaml.load(config_file, Loader=SafeLoader))
raw = path.read_text()
parsed = strictyaml.load(raw).data
return Config(**parsed)


@app.command()
def cli() -> None:
"""CLI."""
config = read_config()
source_directory = Path(
'...',
)
destination_directory = Path(
'...',
)

def cli(
config_file: Path,
source: Path,
destination: Path,
) -> None:
"""Vendetta: anonymize CSV datasets."""
config = read_config(config_file)
vendetta = Vendetta(config=config)

for source_path in source_directory.rglob('*'):
destination_path = destination_directory / source_path.relative_to(
source_directory,
with source.open('r') as input_file, destination.open('w') as output_file:
vendetta(
input_file=input_file,
output_file=output_file,
)
with source_path.open() as source_file:
reader = csv.DictReader(source_file)

if not destination_path.parent.exists():
destination_path.parent.mkdir(parents=True, exist_ok=True)

with destination_path.open('w+') as destination_file:
writer = csv.DictWriter(
destination_file,
fieldnames=reader.fieldnames,
)
writer.writeheader()

for row in reader:
writer.writerow(
vendetta.anonymize_row(row),
)


def main() -> None:
Expand Down
17 changes: 0 additions & 17 deletions vendetta/example.py

This file was deleted.

3 changes: 0 additions & 3 deletions vendetta/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,6 @@
# Generate a random value cached based on original value
ResponsibleFake = Callable[[str], str]

# Function which changes the row of the source data.
RowUpdater = Callable[[Row], None]


class FakerConfig(BaseModel):
"""Faker configuration."""
Expand Down
14 changes: 5 additions & 9 deletions vendetta/vendetta.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,10 @@
import csv
from functools import lru_cache
from typing import Callable, Dict, TextIO
from typing import Dict, TextIO

from faker import Faker

from vendetta.models import Config, Row, ResponsibleFake, NaiveFake

faker = Faker()


def cached_faker(fake: Callable[[], str]) -> Callable[[str], str]:
return lru_cache()(lambda _: fake())
from vendetta.models import Config, ResponsibleFake, NaiveFake


class Vendetta:
Expand Down Expand Up @@ -52,7 +46,7 @@ def anonymize_file(self, input_file: TextIO, output_file: TextIO) -> None:

fake_per_column = {
column_name: self.get_fake_by_name(fake_name)
for column_name, fake_name in self.config.columns
for column_name, fake_name in self.config.columns.items()
if column_name in set(columns)
}

Expand All @@ -61,3 +55,5 @@ def anonymize_file(self, input_file: TextIO, output_file: TextIO) -> None:
row[column_name] = fake(row[column_name])

writer.writerow(row)

__call__ = anonymize_file

0 comments on commit 26b3884

Please sign in to comment.