Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(CSV): allow iterating from an SplFileObject #55

Merged
merged 1 commit into from
Dec 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 66 additions & 16 deletions src/Iterator/CSVIterator.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
use BenTools\ETL\Normalizer\NumericStringToNumberNormalizer;
use BenTools\ETL\Normalizer\ValueNormalizerInterface;
use IteratorAggregate;
use SplFileObject;
use Symfony\Component\OptionsResolver\OptionsResolver;
use Traversable;

Expand Down Expand Up @@ -61,39 +62,88 @@ public function __construct(
$this->options = $resolver->resolve($options);
}

/**
 * Turns one parsed CSV record into the row that will be yielded.
 *
 * Each value is passed through every configured normalizer, in the order
 * they appear in the `normalizers` option. When a non-empty column list is
 * supplied, the normalized values are then handed to self::combine()
 * together with that list; otherwise they are returned as they are.
 *
 * @param array<int|string, mixed> $data    values of a single record
 * @param list<string>|null        $columns column names, or null/empty for none
 *
 * @return array|string[]
 */
private function extract(array $data, ?array $columns): array
{
    if ($this->options['normalizers']) {
        foreach ($data as &$cell) {
            foreach ($this->options['normalizers'] as $normalizer) {
                $cell = $normalizer->normalize($cell);
            }
        }
        // Drop the reference so later writes cannot alias the last element.
        unset($cell);
    }

    if (empty($columns)) {
        return $data;
    }

    return self::combine($columns, $data);
}

/**
 * Picks the iteration strategy that matches the wrapped source: an
 * SplFileObject is read through iterateFromFile(), anything else is
 * passed to iterateFromContent().
 */
public function getIterator(): Traversable
{
    $source = $this->text;

    return $source instanceof SplFileObject
        ? $this->iterateFromFile($source)
        : $this->iterateFromContent($source);
}

/**
 * Reads CSV records straight from a file object and yields one row per record.
 *
 * READ_CSV is added to whatever flags are already set on $file, so the
 * caller's object is modified. Records are parsed with the configured
 * delimiter, enclosure and escape string, then passed through extract().
 *
 * When the `columns` option is 'auto', the record read while the file key
 * is 0 is used as the column list and is not yielded.
 *
 * @return Traversable<mixed>
 */
private function iterateFromFile(SplFileObject $file): Traversable
{
    // Preserve the caller's flags and switch CSV reading on.
    $file->setFlags(SplFileObject::READ_CSV | $file->getFlags());

    $autoColumns = 'auto' === $this->options['columns'];
    $columns = $autoColumns ? null : $this->options['columns'];

    while (!$file->eof()) {
        $fields = $file->fgetcsv(
            $this->options['delimiter'],
            $this->options['enclosure'],
            $this->options['escapeString'],
        );

        // A blank line comes back as a single null field; nothing to yield.
        if ([null] === $fields) {
            continue;
        }

        // In 'auto' mode the first record provides the column names.
        if ($autoColumns && 0 === $file->key()) {
            $columns ??= $fields;
            continue;
        }

        yield $this->extract($fields, $columns);
    }
}

/**
 * Parses each string produced by $content as one CSV record and yields the
 * resulting row.
 *
 * Records are parsed with the configured delimiter, enclosure and escape
 * string, then passed through extract(). When the `columns` option is
 * 'auto', the item whose key is 0 is used as the column list and is not
 * yielded.
 *
 * @param Traversable<string> $content
 *
 * @return Traversable<mixed>
 */
private function iterateFromContent(Traversable $content): Traversable
{
    $autoColumns = 'auto' === $this->options['columns'];
    $columns = $autoColumns ? null : $this->options['columns'];

    foreach ($content as $r => $row) {
        $fields = str_getcsv(
            $row,
            $this->options['delimiter'],
            $this->options['enclosure'],
            $this->options['escapeString'],
        );

        // In 'auto' mode the first item provides the column names.
        if ($autoColumns && 0 === $r) {
            $columns ??= $fields;
            continue;
        }

        yield $this->extract($fields, $columns);
    }
}

Expand Down
97 changes: 52 additions & 45 deletions tests/Unit/Iterator/CSVIteratorTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@

use BenTools\ETL\Iterator\CSVIterator;
use BenTools\ETL\Iterator\StrTokIterator;
use SplFileObject;

use function dirname;
use function expect;
use function Safe\file_get_contents;

it('iterates over CSV data', function () {
$content = file_get_contents(dirname(__DIR__, 2).'/Data/10-biggest-cities.csv');
$rows = [...new CSVIterator(new StrTokIterator($content))];
it('iterates over CSV data', function (CSVIterator $iterator) {
$rows = [...$iterator];

expect($rows)->toHaveCount(11)
->and($rows[0])->toBe([
Expand All @@ -30,11 +30,14 @@
3 => 'Asia',
4 => 13929286,
]);
})->with(function () {
$filename = dirname(__DIR__, 2).'/Data/10-biggest-cities.csv';
yield 'string content' => new CSVIterator(new StrTokIterator(file_get_contents($filename)));
yield 'file' => new CSVIterator(new SplFileObject($filename));
});

it('can make columns automatically', function () {
$content = file_get_contents(dirname(__DIR__, 2).'/Data/10-biggest-cities.csv');
$rows = [...new CSVIterator(new StrTokIterator($content), ['columns' => 'auto'])];
it('can make columns automatically', function (CSVIterator $iterator) {
$rows = [...$iterator];

expect($rows)->toHaveCount(10)
->and($rows[0])->toBe([
Expand All @@ -51,21 +54,14 @@
'continent' => 'Asia',
'population' => 13929286,
]);
})->with(function () {
$filename = dirname(__DIR__, 2).'/Data/10-biggest-cities.csv';
yield 'string content' => new CSVIterator(new StrTokIterator(file_get_contents($filename)), ['columns' => 'auto']);
yield 'file' => new CSVIterator(new SplFileObject($filename), ['columns' => 'auto']);
});

it('can map user-defined columns', function () {
$content = file_get_contents(dirname(__DIR__, 2).'/Data/10-biggest-cities.csv');
$rows = [
...new CSVIterator(new StrTokIterator($content), [
'columns' => [
'cityEnglishName',
'cityLocalName',
'countryIsoCode',
'continent',
'population',
],
]),
];
it('can map user-defined columns', function (CSVIterator $iterator) {
$rows = [...$iterator];

expect($rows[1])->toBe([
'cityEnglishName' => 'New York',
Expand All @@ -81,22 +77,21 @@
'continent' => 'Asia',
'population' => 13929286,
]);
})->with(function () {
$columns = [
'cityEnglishName',
'cityLocalName',
'countryIsoCode',
'continent',
'population',
];
$filename = dirname(__DIR__, 2).'/Data/10-biggest-cities.csv';
yield 'string content' => new CSVIterator(new StrTokIterator(file_get_contents($filename)), ['columns' => $columns]);
yield 'file' => new CSVIterator(new SplFileObject($filename), ['columns' => $columns]);
});

it('adds fields when the row has not enough columns', function () {
$content = file_get_contents(dirname(__DIR__, 2).'/Data/10-biggest-cities.csv');
$rows = [
...new CSVIterator(new StrTokIterator($content), [
'columns' => [
'cityEnglishName',
'cityLocalName',
'countryIsoCode',
'continent',
'population',
'misc',
],
]),
];
it('adds fields when the row has not enough columns', function (CSVIterator $iterator) {
$rows = [...$iterator];

expect($rows[1])->toBe([
'cityEnglishName' => 'New York',
Expand All @@ -114,20 +109,22 @@
'population' => 13929286,
'misc' => null,
]);
})->with(function () {
$columns = [
'cityEnglishName',
'cityLocalName',
'countryIsoCode',
'continent',
'population',
'misc',
];
$filename = dirname(__DIR__, 2).'/Data/10-biggest-cities.csv';
yield 'string content' => new CSVIterator(new StrTokIterator(file_get_contents($filename)), ['columns' => $columns]);
yield 'file' => new CSVIterator(new SplFileObject($filename), ['columns' => $columns]);
});

it('removes extra data whenever there are more fields than columns', function () {
$content = file_get_contents(dirname(__DIR__, 2).'/Data/10-biggest-cities.csv');
$rows = [
...new CSVIterator(new StrTokIterator($content), [
'columns' => [
'cityEnglishName',
'cityLocalName',
'countryIsoCode',
'continent',
],
]),
];
it('removes extra data whenever there are more fields than columns', function (CSVIterator $iterator) {
$rows = [...$iterator];

expect($rows[1])->toBe([
'cityEnglishName' => 'New York',
Expand All @@ -141,4 +138,14 @@
'countryIsoCode' => 'JP',
'continent' => 'Asia',
]);
})->with(function () {
$columns = [
'cityEnglishName',
'cityLocalName',
'countryIsoCode',
'continent',
];
$filename = dirname(__DIR__, 2).'/Data/10-biggest-cities.csv';
yield 'string content' => new CSVIterator(new StrTokIterator(file_get_contents($filename)), ['columns' => $columns]);
yield 'file' => new CSVIterator(new SplFileObject($filename), ['columns' => $columns]);
});
Loading