diff --git a/README.md b/README.md index 42c794a..1476a6a 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,29 @@ Optionally, specify a [CSV Dialect](https://frictionlessdata.io/specs/csv-dialec $table = new Table("tests/fixtures/data.csv", null, ["delimiter" => ";"]); ``` +The `Table::read` method returns all the data as an array. It also supports options that modify the reader's behavior: + +```php +$table->read() // returns all the data as an array +``` + +`read` accepts an options parameter, for example: + +```php +$table->read(["cast" => false, "limit" => 5]) +``` + +The following options are available (the values are the default values): + +```php +$table->read([ + "keyed" => true, // flag to emit keyed rows + "extended" => false, // flag to emit extended rows + "cast" => true, // flag to disable data casting if false + "limit" => null, // integer limit of rows to return +]); +``` + Additional methods and functionality ```php diff --git a/src/CsvDialect.php b/src/CsvDialect.php index 4ce3ac5..e6958f5 100644 --- a/src/CsvDialect.php +++ b/src/CsvDialect.php @@ -82,7 +82,21 @@ public function __construct($dialect = null) } } - public function parseRow($line) + /** + * Parses the csv row according to the csv dialect. 
+ * + * Returns an array of fields parsed from the line + * + * In case of line termination inside an enclosed field, the last field will contain a ContinueEnclosedField object + * + * @param string $line + * + * @return array + * + * @throws DataSourceException + * @throws \Exception + */ + public function parseRow($line, $continueLine = null) { // RFC4180 - Each record is located on a separate line, delimited by a line break (CRLF) // Tabular Data - The line terminator character MUST be LF or CRLF @@ -102,6 +116,15 @@ public function parseRow($line) $fields = []; $field = -1; $lastCharPos = mb_strlen($line) - 1; + if ($continueLine) { + if (!is_a($continueLine[count($continueLine) - 1], 'frictionlessdata\\tableschema\\ContinueEnclosedField')) { + throw new \Exception('invalid continueLine'); + } + unset($continueLine[count($continueLine) - 1]); + $fields = $continueLine; + $field = count($fields) - 1; + $enclosed = true; + } for ($charPos = 0; $charPos < mb_strlen($line); ++$charPos) { $char = mb_substr($line, $charPos, 1); if ($enclosed === null) { @@ -116,50 +139,40 @@ public function parseRow($line) ++$field; $fields[$field] = ''; } - continue; } else { ++$field; $fields[$field] = ''; if ($char == $this->dialect['quoteChar']) { $enclosed = true; - continue; } else { $enclosed = false; $fields[$field] .= $char; - continue; } } } elseif ($enclosed) { // processing an enclosed field - if ($this->dialect['doubleQuote'] !== null && $char == $this->dialect['quoteChar']) { - // encountered quote in doubleQuote mode - if ($charPos !== 0 && mb_substr($line, $charPos - 1, 1) == $this->dialect['quoteChar']) { - // previous char was also a double quote - // the quote was added in previous iteration, nothing to do here - continue; - } elseif ($charPos != $lastCharPos && mb_substr($line, $charPos + 1, 1) == $this->dialect['quoteChar']) { - // next char is a also a double quote - add a quote to the field - $fields[$field] .= $this->dialect['quoteChar']; - continue; - } - } - 
if ($this->dialect['escapeChar']) { - // handle escape chars - if ($char == $this->dialect['escapeChar']) { - // char is the escape char, add the escaped char to the string - if ($charPos === $lastCharPos) { - throw new DataSourceException('Encountered escape char at end of line'); - } else { - $fields[$field] .= mb_substr($line, $charPos + 1, 1); - } - continue; - } elseif ($charPos != 0 && mb_substr($line, $charPos - 1, 1) == $this->dialect['escapeChar']) { - // previous char was the escape string - // added the char in previous iteration, nothing to do here - continue; + if ( + $this->dialect['doubleQuote'] !== null && $char == $this->dialect['quoteChar'] + && $charPos != $lastCharPos && mb_substr($line, $charPos + 1, 1) == $this->dialect['quoteChar'] + ) { + // doubleQuote mode is active, current char is a quote and next char is a quote + $fields[$field] .= $this->dialect['quoteChar']; + // skip a char + ++$charPos; + continue; + } elseif ( + $this->dialect['escapeChar'] && $char === $this->dialect['escapeChar'] + ) { + // encountered escape char, add the escaped char to the string + if ($charPos === $lastCharPos) { + throw new DataSourceException('Encountered escape char at end of line'); + } else { + $fields[$field] .= mb_substr($line, $charPos + 1, 1); } - } - if ($char == $this->dialect['quoteChar']) { + // skip a char + ++$charPos; + continue; + } elseif ($char == $this->dialect['quoteChar']) { // encountered a quote signifying the end of the enclosed field $enclosed = null; continue; @@ -193,15 +206,19 @@ public function parseRow($line) } } } - if (count($fields) > 1 && mb_strlen($fields[count($fields) - 1]) == 0) { - throw new \Exception('Invalid csv file - line must not end with a comma'); - } if ($this->dialect['skipInitialSpace']) { - return array_map(function ($field) { + $fields = array_map(function ($field) { return ltrim($field); }, $fields); - } else { - return $fields; } + if ($enclosed === true && !is_a($fields[count($fields) - 1], 
'frictionlessdata\\tableschema\\ContinueEnclosedField')) { + $fields[$field + 1] = new ContinueEnclosedField(); + } + + return $fields; } } + +class ContinueEnclosedField +{ +} diff --git a/src/DataSources/CsvDataSource.php b/src/DataSources/CsvDataSource.php index 69f82cf..584e388 100644 --- a/src/DataSources/CsvDataSource.php +++ b/src/DataSources/CsvDataSource.php @@ -90,8 +90,12 @@ public function getNextLine() $this->nextRow = null; $colNum = 0; $obj = []; + if (count($row) != count($this->headerRow)) { + throw new DataSourceException('Invalid row: '.implode(', ', $row)); + } foreach ($this->headerRow as $fieldName) { - $obj[$fieldName] = $row[$colNum++]; + $obj[$fieldName] = $row[$colNum]; + ++$colNum; } return $obj; @@ -168,7 +172,7 @@ public function save($outputDataSource) * * @throws DataSourceException */ - protected function getRow() + protected function getRow($continueRow = null) { ++$this->curRowNum; try { @@ -177,6 +181,11 @@ protected function getRow() throw new DataSourceException($e->getMessage(), $this->curRowNum); } - return $this->csvDialect->parseRow($line); + $row = $this->csvDialect->parseRow($line, $continueRow); + if (count($row) > 0 && is_a($row[count($row) - 1], 'frictionlessdata\\tableschema\\ContinueEnclosedField')) { + return $this->getRow($row); + } else { + return $row; + } } } diff --git a/src/Table.php b/src/Table.php index 2754f86..ddc50be 100644 --- a/src/Table.php +++ b/src/Table.php @@ -102,11 +102,45 @@ public function headers($numPeekRows = 10) return array_keys($this->schema->fields()); } - public function read() + public function read($options = null) { + $options = array_merge([ + 'keyed' => true, + 'extended' => false, + 'cast' => true, + 'limit' => null, + ], $options ? $options : []); $rows = []; - foreach ($this as $row) { - $rows[] = $row; + $rowNum = 0; + if ($options['extended']) { + $headers = $this->headers($options['limit'] ? 
$options['limit'] : null); + } + if (!$options['cast']) { + $this->dataSource->open(); + while (!$this->dataSource->isEof()) { + $row = $this->dataSource->getNextLine(); + if ($options['extended']) { + $rows[] = [$rowNum, $headers, array_values($row)]; + } else { + $rows[] = $row; + } + if ($options['limit'] && $options['limit'] > 0 && $rowNum + 1 >= $options['limit']) { + break; + } + ++$rowNum; + } + } else { + foreach ($this as $row) { + if ($options['extended']) { + $rows[] = [$rowNum, $headers, array_values($row)]; + } else { + $rows[] = $row; + } + if ($options['limit'] && $options['limit'] > 0 && $rowNum + 1 >= $options['limit']) { + break; + } + ++$rowNum; + } } return $rows; diff --git a/tests/TableTest.php b/tests/TableTest.php index bc1f288..da12521 100644 --- a/tests/TableTest.php +++ b/tests/TableTest.php @@ -234,17 +234,26 @@ public function testCsvDialectLolsv() ], $rows); } + public function testCsvLineBreak() + { + $table = new Table($this->fixture('data_linebreaks.csv')); + $this->assertEquals([ + ['aaa' => 'test a', 'bbb' => 'test b', 'ccc' => 'test c'], + ], $table->read()); + } + public function testCsvDialectDatapackagePipelines() { $datapackage = json_decode(file_get_contents($this->fixture('committees/datapackage.json'))); $resource = $datapackage->resources[0]; $table = new Table($this->fixture('committees/kns_committee.csv'), $resource->schema, $resource->dialect); $rows = []; + $rowNum = 0; foreach ($table as $row) { - $rows[] = $row; - if (count($rows) == 2) { - break; + if (in_array($rowNum, [0, 1, 132])) { + $rows[] = $row; } + ++$rowNum; } $this->assertEquals([[ 'CommitteeID' => 97, @@ -280,9 +289,60 @@ public function testCsvDialectDatapackagePipelines() 'CommitteeParentName' => null, 'IsCurrent' => true, 'LastUpdatedDate' => Carbon::create(2015, 3, 20, 12, 2, 57), + ], [ + 'CommitteeID' => 679, + 'Name' => 'משותפת לכלכלה וחינוך לדיון בחוק הרשות השניה לטלויזיה ורדיו התש"ן-1990', + 'CategoryID' => 317, + 'CategoryDesc' => 'ועדה משותפת 
לכלכלה וחינוך לדיון בחוק הרשות השניה לטלוויזיה ורדיו, התש"ן-1990', + 'KnessetNum' => 18, + 'CommitteeTypeID' => 73, + 'CommitteeTypeDesc' => 'ועדה משותפת', + 'Email' => 'vkalkala@knesset.gov.il', + 'StartDate' => Carbon::create(2009, 6, 30, 0, 0, 0), + 'FinishDate' => null, + 'AdditionalTypeID' => 991, + 'AdditionalTypeDesc' => 'קבועה', + 'ParentCommitteeID' => null, + 'CommitteeParentName' => null, + 'IsCurrent' => true, + 'LastUpdatedDate' => Carbon::create(2015, 3, 20, 12, 2, 57), ]], $rows); } + public function testReadOptions() + { + $datapackage = json_decode(file_get_contents($this->fixture('committees/datapackage.json'))); + $resource = $datapackage->resources[0]; + $table = new Table($this->fixture('committees/kns_committee.csv'), $resource->schema, $resource->dialect); + $this->assertEquals([ + [ + 0, + [ + 'CommitteeID', 'Name', 'CategoryID', 'CategoryDesc', 'KnessetNum', 'CommitteeTypeID', + 'CommitteeTypeDesc', 'Email', 'StartDate', 'FinishDate', 'AdditionalTypeID', + 'AdditionalTypeDesc', 'ParentCommitteeID', 'CommitteeParentName', 'IsCurrent', 'LastUpdatedDate', + ], [ + '97', 'ה"ח המדיניות הכלכלית לשנת הכספים 2004', '', '', '16', '73', 'ועדה משותפת', '', + '2004-08-12 00:00:00', '', '', + '', '', '', 'True', '2015-03-20 12:02:57', + ], + ], + [ + 1, + [ + 'CommitteeID', 'Name', 'CategoryID', 'CategoryDesc', 'KnessetNum', 'CommitteeTypeID', + 'CommitteeTypeDesc', 'Email', 'StartDate', 'FinishDate', 'AdditionalTypeID', + 'AdditionalTypeDesc', 'ParentCommitteeID', 'CommitteeParentName', 'IsCurrent', 'LastUpdatedDate', + ], + [ + '314', 'המיוחדת לענין לקחי אסון גשר המכביה', '', '', '14', '72', 'ועדה מיוחדת', '', + '1988-10-19 00:00:00', '', '992', + 'מיוחדת', '', '', 'True', '2015-03-20 12:02:57', + ], + ], + ], $table->read(['keyed' => false, 'extended' => true, 'cast' => false, 'limit' => 2])); + } + protected $fixturesPath; protected $validSchema; diff --git a/tests/fixtures/data_linebreaks.csv b/tests/fixtures/data_linebreaks.csv new file mode 
100644 index 0000000..fba5681 --- /dev/null +++ b/tests/fixtures/data_linebreaks.csv @@ -0,0 +1,4 @@ +aaa,bbb,ccc +"test a +","test b","test c +"