diff --git a/README.md b/README.md index 40b9a51..61414a5 100644 --- a/README.md +++ b/README.md @@ -27,13 +27,13 @@ Tested with ElasticSearch 5.6.2 and 6.2.3. Running the application, filename and index is required, to import from the terminal run: ``` -java -Xmx2g -jar excelastic-1.2.7.jar --mapping mappingName --clear +java -Xmx2g -jar excelastic-1.3.0.jar --mapping mappingName --clear ``` If running with --clear, then the existing index will be cleared before the import starts. To run with the web interface, run the following in your terminal: ``` -java -Xmx2g -jar excelastic-1.2.7.jar +java -Xmx2g -jar excelastic-1.3.0.jar ``` When the application successfully connects to the ElasticSearch server, the browser will automatically open a new tab. @@ -67,10 +67,7 @@ If no configuration file is present a new configuration file will be created usi ## Contributing -If you want to contribute to this project, open an issue or pull request. :: - -In the 1.2.7 release we have cleaned up the code and added even more javadoc -in order to promote contributions! :astonished: +If you want to contribute to this project, open an issue or pull request. :heart_eyes_cat: :metal: --- diff --git a/excelastic.png b/excelastic.png index 6100d72..dfdf506 100644 Binary files a/excelastic.png and b/excelastic.png differ diff --git a/pom.xml b/pom.xml index 7c4ab1f..a4e7d21 100644 --- a/pom.xml +++ b/pom.xml @@ -10,7 +10,7 @@ com.codingchili excelastic - 1.2.7 + 1.3.0 diff --git a/src/main/java/com/codingchili/ApplicationLauncher.java b/src/main/java/com/codingchili/ApplicationLauncher.java index c5d346b..11d275f 100644 --- a/src/main/java/com/codingchili/ApplicationLauncher.java +++ b/src/main/java/com/codingchili/ApplicationLauncher.java @@ -21,7 +21,7 @@ */ public class ApplicationLauncher { private final ApplicationLogger logger = new ApplicationLogger(getClass()); - public static String VERSION = "1.2.7"; + public static String VERSION = "1.3.0"; private Vertx vertx; public static void main(String[] args) { diff --git a/src/main/java/com/codingchili/Controller/CommandLine.java b/src/main/java/com/codingchili/Controller/CommandLine.java index 3523313..a6cc298 100644 --- a/src/main/java/com/codingchili/Controller/CommandLine.java +++ b/src/main/java/com/codingchili/Controller/CommandLine.java @@ -46,9 +46,11 @@ private void importFile(ImportEvent event, String fileName) { logger.loadingFromFilesystem(fileName); logger.parsingStarted(); try { - FileParser parser = new FileParser(new File(fileName), 1, fileName); + FileParser parser = ParserFactory.getByFilename(fileName); + parser.setFileData(fileName, 1, fileName); + event.setParser(parser); - parser.assertFileParsable(); + parser.initialize(); logger.importStarted(event.getIndex()); vertx.eventBus().send(Configuration.INDEXING_ELASTICSEARCH, event, getDeliveryOpts(), diff --git a/src/main/java/com/codingchili/Controller/Website.java b/src/main/java/com/codingchili/Controller/Website.java index f17cb9f..a77c9db 100644 --- a/src/main/java/com/codingchili/Controller/Website.java +++ b/src/main/java/com/codingchili/Controller/Website.java @@ -20,7 +20,7 @@ import static com.codingchili.ApplicationLauncher.VERSION; import static com.codingchili.Model.Configuration.INDEXING_ELASTICSEARCH; import static com.codingchili.Model.ElasticWriter.*; -import static com.codingchili.Model.FileParser.INDEX; +import static com.codingchili.Model.ExcelParser.INDEX; /** * @author Robin Duda @@ -172,8 +172,10 @@ private void parse(String uploadedFileName, MultiMap params, String fileName, Fu vertx.executeBlocking(blocking -> { try { ImportEvent event = ImportEvent.fromParams(params); - FileParser parser = new FileParser(new File(uploadedFileName), event.getOffset(), fileName); - parser.assertFileParsable(); + FileParser parser = ParserFactory.getByFilename(fileName); + parser.setFileData(uploadedFileName, event.getOffset(), fileName); + + parser.initialize(); event.setParser(parser); // submit an import event. diff --git a/src/main/java/com/codingchili/Model/CSVParser.java b/src/main/java/com/codingchili/Model/CSVParser.java new file mode 100644 index 0000000..17882bc --- /dev/null +++ b/src/main/java/com/codingchili/Model/CSVParser.java @@ -0,0 +1,220 @@ +package com.codingchili.Model; + +import io.vertx.core.json.JsonObject; +import org.reactivestreams.Subscriber; +import org.reactivestreams.Subscription; + +import java.io.*; +import java.nio.ByteBuffer; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.util.*; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * @author Robin Duda + *

+ * Parses CSV files. + */ +public class CSVParser implements FileParser { + private static final int MAX_LINE_LENGTH = 16384; + private static final int PAGE_16MB = 16777216; + + private static final char TOKEN_NULL = '\0'; + private static final char TOKEN_CR = '\r'; + private static final char TOKEN_LF = '\n'; + private static final char TOKEN_QUOTE = '\"'; + private static final char TOKEN_SEPARATOR = ','; + + private ByteBuffer buffer = ByteBuffer.allocate(MAX_LINE_LENGTH); + private JsonObject headers = new JsonObject(); + private Iterator header; + private RandomAccessFile file; + private MappedByteBuffer map; + private long fileSize; + private int index = 0; + private int rows = 0; + + @Override + public void setFileData(String localFileName, int offset, String fileName) throws FileNotFoundException { + file = new RandomAccessFile(localFileName, "rw"); + try { + map = file.getChannel().map(FileChannel.MapMode.READ_ONLY, 0, PAGE_16MB); + fileSize = file.length(); + readRowCount(); + readHeaders(); + } catch (IOException e) { + throw new ParserException(e); + } + } + + @Override + public Set getSupportedFileExtensions() { + return new HashSet<>(Collections.singletonList(".csv")); + } + + @Override + public void initialize() { + index = 0; + map.position(0); + readRow(); // skip headers row. + for (int i = 0; i < rows; i++) { + readRow(); + } + } + + private int readRowCount() { + for (int i = map.position(); i < fileSize; i++) { + if (map.get(i) == '\n') { + rows++; + } + } + return rows; + } + + private void readHeaders() throws IOException { + map.position(0); + + for (int i = map.position(); i < file.length(); i++) { + if (map.get(i) == '\n') { + Arrays.stream(new String(buffer.array()).split(",")) + .map(header -> header.replaceAll("\"", "")) + .map(String::trim).forEach(header -> { + headers.put(header, ""); + }); + break; + } else { + buffer.put(map.get(i)); + } + } + buffer.clear(); + } + + private void process(AtomicInteger columnsRead, ByteBuffer buffer, JsonObject json) { + columnsRead.incrementAndGet(); + + if (columnsRead.get() > headers.size()) { + throw new ColumnsExceededHeadersException(columnsRead.get(), headers.size(), index + 1); + } else { + int read = buffer.position(); + byte[] line = new byte[read + 1]; + + buffer.position(0); + buffer.get(line, 0, read); + line[line.length - 1] = '\0'; + + json.put(header.next(), parseDatatype(line)); + buffer.clear(); + } + } + + private JsonObject readRow() { + // reset current header. + header = headers.fieldNames().iterator(); + + AtomicInteger columnsRead = new AtomicInteger(0); + JsonObject json = headers.copy(); + boolean quoted = false; + boolean done = false; + + while (!done) { + byte current = map.get(); + + switch (current) { + case TOKEN_NULL: + // EOF call process. + process(columnsRead, buffer, json); + done = true; + break; + case TOKEN_CR: + case TOKEN_LF: + // final header is being read and EOL appears. + if (columnsRead.get() == headers.size() - 1) { + process(columnsRead, buffer, json); + done = true; + break; + } else { + // skip token if not all headers read. + continue; + } + case TOKEN_QUOTE: + // toggle quoted to support commas within quotes. + quoted = !quoted; + break; + case TOKEN_SEPARATOR: + if (!quoted) { + process(columnsRead, buffer, json); + break; + } + default: + // store the current token in the buffer until the column ends. + buffer.put(current); + } + } + + if (!(columnsRead.get() == headers.size())) { + throw new ParserException( + String.format("Error at line %d, values (%d) does not match headers (%d).", + index, columnsRead.get(), headers.size())); + } else { + index++; + } + + // parse json object. + return json; + } + + private Object parseDatatype(byte[] data) { + String line = new String(data).trim(); + + if (line.matches("[0-9]*")) { + return Integer.parseInt(line); + } else if (line.matches("true|false")) { + return Boolean.parseBoolean(line); + } else { + return line; + } + } + + @Override + public int getNumberOfElements() { + return rows; + } + + @Override + public void subscribe(Subscriber subscriber) { + map.position(0); + readRow(); + index = 0; + + subscriber.onSubscribe(new Subscription() { + private boolean complete = false; + private int index = 0; + + @Override + public void request(long count) { + for (int i = 0; i < count && i < rows; i++) { + JsonObject result = readRow(); + + if (result != null) { + subscriber.onNext(result); + } else { + complete = true; + subscriber.onComplete(); + } + } + + index += count; + + if (index >= rows && !complete) { + subscriber.onComplete(); + } + } + + @Override + public void cancel() { + // send no more items! + } + }); + } +} diff --git a/src/main/java/com/codingchili/Model/ColumnsExceededHeadersException.java b/src/main/java/com/codingchili/Model/ColumnsExceededHeadersException.java new file mode 100644 index 0000000..6e6a778 --- /dev/null +++ b/src/main/java/com/codingchili/Model/ColumnsExceededHeadersException.java @@ -0,0 +1,19 @@ +package com.codingchili.Model; + +/** + * @author Robin Duda + * + * Thrown when more columns are encountered than there is headers. + */ +public class ColumnsExceededHeadersException extends ParserException { + + /** + * @param values number of values encountered + * @param headers the number of headers on the first row. + * @param index the line in the file. + */ + public ColumnsExceededHeadersException(int values, int headers, int index) { + super(String.format("Encountered too many values (%d) on row %d, expected to match headers (%d).", + values, index, headers)); + } +} diff --git a/src/main/java/com/codingchili/Model/ExcelParser.java b/src/main/java/com/codingchili/Model/ExcelParser.java new file mode 100644 index 0000000..cac3b20 --- /dev/null +++ b/src/main/java/com/codingchili/Model/ExcelParser.java @@ -0,0 +1,259 @@ +package com.codingchili.Model; + +import com.codingchili.logging.ApplicationLogger; +import io.vertx.core.json.JsonObject; +import org.apache.poi.hssf.usermodel.HSSFWorkbook; +import org.apache.poi.openxml4j.exceptions.InvalidFormatException; +import org.apache.poi.ss.usermodel.*; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.reactivestreams.*; + +import java.io.*; +import java.util.*; +import java.util.function.Consumer; + +/** + * @author Robin Duda + *

+ * Parses xlsx files into json objects. + */ +public class ExcelParser implements FileParser { + public static final String INDEX = "index"; + private static final String OOXML = ".xlsx"; + private static final String XML97 = ".xls"; + private ApplicationLogger logger = new ApplicationLogger(getClass()); + private String fileName; + private Sheet sheet; + private int columns; + private int offset; + private int rows; + + @Override + public void setFileData(String localFileName, int offset, String fileName) + throws ParserException, FileNotFoundException { + + File file = new File(localFileName); + offset -= 1; // convert excel row number to 0-based index. + + if (file.exists()) { + try { + Workbook workbook = getWorkbook(file, fileName); + this.sheet = workbook.getSheetAt(0); + this.offset = offset; + this.fileName = fileName; + this.columns = getColumnCount(sheet.getRow(offset)); + this.rows = getItemCount(sheet, offset); + } catch (Exception e) { + if (e instanceof ParserException) { + throw (ParserException) e; + } else { + throw new ParserException(e); + } + } + } else { + throw new FileNotFoundException(file.getAbsolutePath()); + } + } + + @Override + public Set getSupportedFileExtensions() { + return new HashSet<>(Arrays.asList(OOXML, XML97)); + } + + /** + * Returns a workbook implementation based on the extension of the filname. + * + * @param file stream representing a workbook + * @param fileName the filename to determine a specific workbook implementation + * @return a workbook implentation that supports the given file format + * @throws ParserException when the file extension is unsupported + * @throws IOException when the given data is not a valid workbook + */ + private Workbook getWorkbook(File file, String fileName) throws ParserException, IOException { + if (fileName.endsWith(OOXML)) { + try { + return new XSSFWorkbook(file); + } catch (InvalidFormatException e) { + throw new ParserException(e); + } + } else if (fileName.endsWith(XML97)) { + return new HSSFWorkbook(new FileInputStream(file)); + } else { + throw new ParserException( + String.format("Unrecognized file extension for file %s, expected %s or %s.", + fileName, OOXML, XML97)); + } + } + + @Override + public void initialize() { + logger.parsingFile(fileName, offset); + + // parse all rows. + readRows((json) -> { + // skip storing the results of the parse. + }, offset, rows, true); + + logger.parsedFile(rows - 1, fileName); + } + + @Override + public void subscribe(Subscriber subscriber) { + subscriber.onSubscribe(new Subscription() { + private int index = 0; + + @Override + public void request(long count) { + readRows(subscriber::onNext, index, count, false); + index += count; + + if (index >= rows) { + subscriber.onComplete(); + } + } + + @Override + public void cancel() { + // send no more items! + } + }); + } + + /** + * Parses the given portion of the excel file, this saves memory as the whole file + * does not need to be stored in memory as JSON at once. + * + * @param begin the offset from the starting row. + * @param count the number of lines to parse. + * @param consumer processor of json items. + */ + public void parseRowRange(int begin, int count, Consumer consumer) { + readRows(consumer, begin, begin + count, false); + } + + + @Override + public int getNumberOfElements() { + return rows; + } + + /** + * Reads the given range of rows and converts it to json. + * + * @param start the starting element, 0 represents the first row after the row with the column titles. + * @param count the number of elements to read - can never read past the max number of rows. + * @param consumer called with the produced JSON object for each parsed row. + */ + private void readRows(Consumer consumer, int start, long count, boolean dryRun) { + String[] columns = getColumns(sheet.getRow(offset)); + + for (int i = start; i < (count + start) && i < rows; i++) { + consumer.accept(getRow(columns, sheet.getRow(i + offset + 1), dryRun)); + } + } + + /** + * retrieves the values of the column titles. + * + * @param row that points to the column titles. + * @return an array of the titles + */ + private String[] getColumns(Row row) { + String[] titles = new String[columns]; + + for (int i = 0; i < titles.length; i++) { + titles[i] = row.getCell(i).getStringCellValue(); + } + return titles; + } + + /** + * Returns the number of columns present on the given row. + * + * @param row the row to read column count from. + * @return the number of columns on the given row + */ + private int getColumnCount(Row row) { + DataFormatter formatter = new DataFormatter(); + Iterator iterator = row.iterator(); + int count = 0; + + while (iterator.hasNext()) { + Cell cell = iterator.next(); + String value = formatter.formatCellValue(cell); + + if (value.length() > 0) { + count++; + } else { + break; + } + } + return count; + } + + /** + * counts the number of rows to be imported taking into account the offset + * of the title columns. + * + * @param sheet the sheet to read items from + * @param offset the offset of the title columns + * @return the number of rows minus the column title offset. + */ + private int getItemCount(Sheet sheet, int offset) { + int count = 0; + Row row = sheet.getRow(offset + 1); + + while (row != null) { + count++; + row = sheet.getRow(offset + 1 + count); + } + + return count; + } + + /** + * retrieves a row as a json object. + * + * @param titles the titles of the row. + * @param row the row to read values from. + * @param dryRun if true no results will be generated and this method returns null. + * @return a jsonobject that maps titles to the column values. + */ + private JsonObject getRow(String[] titles, Row row, boolean dryRun) { + DataFormatter formatter = new DataFormatter(); + JsonObject json = null; + int index = 0; + + if (!dryRun) { + json = new JsonObject(); + } + + for (int i = 0; i < row.getLastCellNum(); i++) { + Cell cell = row.getCell(i); + Object value = null; + + if (cell != null) { + switch (cell.getCellTypeEnum()) { + case STRING: + value = formatter.formatCellValue(cell); + break; + case NUMERIC: + if (DateUtil.isCellDateFormatted(cell)) { + value = cell.getDateCellValue().toInstant().toString(); + } else { + value = cell.getNumericCellValue(); + } + break; + } + // avoid indexing null or empty string, fails to index rows + // when date fields are empty and can lead to mappings being + // set up incorrectly if leading rows has missing data. + if (!dryRun && value != null && !(value.toString().length() == 0)) { + json.put(titles[index], value); + } + } + index++; + } + return json; + } +} diff --git a/src/main/java/com/codingchili/Model/FileParser.java b/src/main/java/com/codingchili/Model/FileParser.java index f120cdb..99b8396 100644 --- a/src/main/java/com/codingchili/Model/FileParser.java +++ b/src/main/java/com/codingchili/Model/FileParser.java @@ -1,260 +1,42 @@ package com.codingchili.Model; -import com.codingchili.logging.ApplicationLogger; import io.vertx.core.json.JsonObject; -import org.apache.poi.hssf.usermodel.HSSFWorkbook; -import org.apache.poi.openxml4j.exceptions.InvalidFormatException; -import org.apache.poi.ss.usermodel.*; -import org.apache.poi.xssf.usermodel.XSSFWorkbook; -import org.reactivestreams.*; +import org.reactivestreams.Publisher; -import java.io.*; -import java.util.Iterator; -import java.util.function.Consumer; +import java.io.FileNotFoundException; +import java.util.Set; /** * @author Robin Duda *

- * Parses xlsx files into json objects. + * Interface used to support different input file formats. + * The parser is subscribable and emits json objects for importing. */ -public class FileParser implements Publisher { - public static final String INDEX = "index"; - private static final String OOXML = ".xlsx"; - private static final String XML97 = ".xls"; - private ApplicationLogger logger = new ApplicationLogger(getClass()); - private String fileName; - private Sheet sheet; - private int columns; - private int offset; - private int rows; +public interface FileParser extends Publisher { /** - * Parses the contents of an excel into JSON. - * - * @param file file of an excel file. - * @param offset row number containing column titles. + * @param localFileName a file on disk to be parsed, do not read this into memory + * as it could be potentially very large. + * @param offset indicates how many empty rows to skip before finding the titles. + * @param fileName the original name of the file to be imported. */ - public FileParser(File file, int offset, String fileName) throws ParserException, FileNotFoundException { - offset -= 1; // convert excel row number to 0-based index. - - if (file.exists()) { - try { - Workbook workbook = getWorkbook(file, fileName); - this.sheet = workbook.getSheetAt(0); - this.offset = offset; - this.fileName = fileName; - this.columns = getColumnCount(sheet.getRow(offset)); - this.rows = getItemCount(sheet, offset); - } catch (Exception e) { - if (e instanceof ParserException) { - throw (ParserException) e; - } else { - throw new ParserException(e); - } - } - } else { - throw new FileNotFoundException(file.getAbsolutePath()); - } - } + void setFileData(String localFileName, int offset, String fileName) throws FileNotFoundException; /** - * Returns a workbook implementation based on the extension of the filname. - * - * @param file stream representing a workbook - * @param fileName the filename to determine a specific workbook implementation - * @return a workbook implentation that supports the given file format - * @throws ParserException when the file extension is unsupported - * @throws IOException when the given data is not a valid workbook + * @return a set of file extensions that this fileparser supports. */ - private Workbook getWorkbook(File file, String fileName) throws ParserException, IOException { - if (fileName.endsWith(OOXML)) { - try { - return new XSSFWorkbook(file); - } catch (InvalidFormatException e) { - throw new ParserException(e); - } - } else if (fileName.endsWith(XML97)) { - return new HSSFWorkbook(new FileInputStream(file)); - } else { - throw new ParserException( - String.format("Unrecognized file extension for file %s, expected %s or %s.", - fileName, OOXML, XML97)); - } - } - - @Override - public void subscribe(Subscriber subscriber) { - subscriber.onSubscribe(new Subscription() { - private int index = 0; - - @Override - public void request(long count) { - readRows(index, count, subscriber::onNext, false); - index += count; - - if (index >= rows) { - subscriber.onComplete(); - } - } - - @Override - public void cancel() { - // send no more items! - } - }); - } + Set getSupportedFileExtensions(); /** * Parses the excel file to make sure that it is parseable without allocating memory - * for the result. This should be called before{@link #parseRowRange(int, int, Consumer)} to make + * for the result. This should be called before importing to make * sure any imports does not fail halfway through. */ - public void assertFileParsable() { - logger.parsingFile(fileName, offset); - - // parse all rows. - readRows(offset, rows, (json) -> { - // skip storing the results of the parse. - }, true); - logger.parsedFile(rows - 1, fileName); - } - - /** - * Parses the given portion of the excel file, this saves memory as the whole file - * does not need to be stored in memory as JSON at once. - * - * @param begin the offset from the starting row. - * @param count the number of lines to parse. - * @param consumer processor of json items. - */ - public void parseRowRange(int begin, int count, Consumer consumer) { - readRows(begin, begin + count, consumer, false); - } + void initialize(); /** * @return the number of elements that was parsed. */ - public int getNumberOfElements() { - return rows; - } - - /** - * Reads the given range of rows and converts it to json. - * - * @param start the starting element, 0 represents the first row after the row with the column titles. - * @param count the number of elements to read - can never read past the max number of rows. - * @param consumer called with the produced JSON object for each parsed row. - */ - private void readRows(int start, long count, Consumer consumer, boolean dryRun) { - String[] columns = getColumns(sheet.getRow(offset)); - - for (int i = start; i < (count + start) && i < rows; i++) { - consumer.accept(getRow(columns, sheet.getRow(i + offset + 1), dryRun)); - } - } - - /** - * retrieves the values of the column titles. - * - * @param row that points to the column titles. - * @return an array of the titles - */ - private String[] getColumns(Row row) { - String[] titles = new String[columns]; - - for (int i = 0; i < titles.length; i++) { - titles[i] = row.getCell(i).getStringCellValue(); - } - return titles; - } - - /** - * Returns the number of columns present on the given row. - * - * @param row the row to read column count from. - * @return the number of columns on the given row - */ - private int getColumnCount(Row row) { - DataFormatter formatter = new DataFormatter(); - Iterator iterator = row.iterator(); - int count = 0; - - while (iterator.hasNext()) { - Cell cell = iterator.next(); - String value = formatter.formatCellValue(cell); - - if (value.length() > 0) { - count++; - } else { - break; - } - } - return count; - } - - /** - * counts the number of rows to be imported taking into account the offset - * of the title columns. - * - * @param sheet the sheet to read items from - * @param offset the offset of the title columns - * @return the number of rows minus the column title offset. - */ - private int getItemCount(Sheet sheet, int offset) { - int count = 0; - Row row = sheet.getRow(offset + 1); - - while (row != null) { - count++; - row = sheet.getRow(offset + 1 + count); - } - - return count; - } - - /** - * retrieves a row as a json object. - * - * @param titles the titles of the row. - * @param row the row to read values from. - * @param dryRun if true no results will be generated and this method returns null. - * @return a jsonobject that maps titles to the column values. - */ - private JsonObject getRow(String[] titles, Row row, boolean dryRun) { - DataFormatter formatter = new DataFormatter(); - JsonObject json = null; - int index = 0; - - if (!dryRun) { - json = new JsonObject(); - } - - for (int i = 0; i < row.getLastCellNum(); i++) { - Cell cell = row.getCell(i); - Object value = null; + int getNumberOfElements(); - if (cell != null) { - switch (cell.getCellTypeEnum()) { - case STRING: - value = formatter.formatCellValue(cell); - break; - case NUMERIC: - if (DateUtil.isCellDateFormatted(cell)) { - value = cell.getDateCellValue().toInstant().toString(); - } else { - value = cell.getNumericCellValue(); - } - break; - } - // avoid indexing null or empty string, fails to index rows - // when date fields are empty and can lead to mappings being - // set up incorrectly if leading rows has missing data. - if (!dryRun && value != null && !(value.toString().length() == 0)) { - json.put(titles[index], value); - } - } - index++; - } - return json; - } } diff --git a/src/main/java/com/codingchili/Model/ImportEvent.java b/src/main/java/com/codingchili/Model/ImportEvent.java index 56e7a3c..b952f09 100644 --- a/src/main/java/com/codingchili/Model/ImportEvent.java +++ b/src/main/java/com/codingchili/Model/ImportEvent.java @@ -6,7 +6,7 @@ import java.util.Optional; import static com.codingchili.Controller.Website.UPLOAD_ID; -import static com.codingchili.Model.FileParser.INDEX; +import static com.codingchili.Model.ExcelParser.INDEX; /** * @author Robin Duda @@ -53,7 +53,7 @@ public static ImportEvent fromCommandLineArgs(String[] args) { return new ImportEvent() .setIndex(args[1]) .setOffset(getArgParamValue(args, ARG_OFFSET).map(Integer::parseInt).orElse(1)) - .setClearExisting(Arrays.stream(args).anyMatch(param -> param.equals(ARG_CLEAR))) + .setClearExisting(Arrays.asList(args).contains(ARG_CLEAR)) .setMapping(getArgParamValue(args, ARG_MAPPING).orElse("default")); } diff --git a/src/main/java/com/codingchili/Model/ImportEventCodec.java b/src/main/java/com/codingchili/Model/ImportEventCodec.java index c05a7ce..7bcfac5 100644 --- a/src/main/java/com/codingchili/Model/ImportEventCodec.java +++ b/src/main/java/com/codingchili/Model/ImportEventCodec.java @@ -7,7 +7,7 @@ /** * @author Robin Duda *

- * This codec is used to transfer a {@link FileParser} reference over the local event bus. + * This codec is used to transfer a {@link ExcelParser} reference over the local event bus. */ public class ImportEventCodec implements MessageCodec { diff --git a/src/main/java/com/codingchili/Model/InvalidFileNameException.java b/src/main/java/com/codingchili/Model/InvalidFileNameException.java new file mode 100644 index 0000000..e0b78e2 --- /dev/null +++ b/src/main/java/com/codingchili/Model/InvalidFileNameException.java @@ -0,0 +1,16 @@ +package com.codingchili.Model; + +/** + * @author Robin Duda + * + * Thrown when an invalid filename has been specified. + */ +public class InvalidFileNameException extends RuntimeException { + + /** + * @param fileName the full filename. + */ + public InvalidFileNameException(String fileName) { + super(String.format("File with name '%s' is missing extension.", fileName)); + } +} diff --git a/src/main/java/com/codingchili/Model/ParserFactory.java b/src/main/java/com/codingchili/Model/ParserFactory.java new file mode 100644 index 0000000..e90a45d --- /dev/null +++ b/src/main/java/com/codingchili/Model/ParserFactory.java @@ -0,0 +1,60 @@ +package com.codingchili.Model; + +import java.util.*; +import java.util.concurrent.ConcurrentHashMap; +import java.util.function.Supplier; + +/** + * @author Robin Duda + *

+ * Handles support for multiple file formats. + */ +public class ParserFactory { + private static final Map> parsers = new ConcurrentHashMap<>(); + + static { + register(ExcelParser::new); + register(CSVParser::new); + } + + /** + * @param parser the given parser is instantiated and registered for use with its + * supported file extensions. + */ + public static void register(Supplier parser) { + for (String ext : parser.get().getSupportedFileExtensions()) { + parsers.put(ext, parser); + } + } + + /** + * Retrieves a file parser that is registered for the file extension in the given filename. + * + * @param fileName a filename that contains an extension. + * @return a parser that is registered for use with the given extension, throws an + * exception if no parser exists or if the file does not have an extension. + */ + public static FileParser getByFilename(String fileName) { + int extensionAt = fileName.lastIndexOf("."); + + if (extensionAt > 0) { + // include the dot separator in the extension. + String extension = fileName.substring(extensionAt); + + if (parsers.containsKey(extension)) { + return parsers.get(extension).get(); + } else { + throw new UnsupportedFileTypeException(extension); + } + } else { + throw new InvalidFileNameException(fileName); + } + } + + /** + * @return a list of file extensions that is registered in the parser factory. + */ + public static Set getSupportedExtensions() { + return parsers.keySet(); + } +} diff --git a/src/main/java/com/codingchili/Model/UnsupportedFileTypeException.java b/src/main/java/com/codingchili/Model/UnsupportedFileTypeException.java new file mode 100644 index 0000000..8edf9ea --- /dev/null +++ b/src/main/java/com/codingchili/Model/UnsupportedFileTypeException.java @@ -0,0 +1,16 @@ +package com.codingchili.Model; + +/** + * @author Robin Duda + * + * Thrown when a parser has not been registered for the given file extension. + */ +public class UnsupportedFileTypeException extends RuntimeException { + + /** + * @param extension the file extension that was unsupported. + */ + public UnsupportedFileTypeException(String extension) { + super(String.format("Missing parser for file extension '%s'.", extension)); + } +} diff --git a/src/main/resources/templates/index.jade b/src/main/resources/templates/index.jade index fcae9e1..5276b06 100644 --- a/src/main/resources/templates/index.jade +++ b/src/main/resources/templates/index.jade @@ -25,16 +25,16 @@ html(lang='en') input#uploadId(hidden='true', value='', name='uploadId') fieldset .form-group - label.col-lg-2.control-label(for='index') Index - .col-lg-10 + label.col-lg-3.control-label(for='index') Index + .col-lg-9 input#index.form-control(type='text', name='index', placeholder='generate date') .form-group - label.col-lg-2.control-label(for='mapping') Mapping - .col-lg-10 + label.col-lg-3.control-label(for='mapping') Mapping + .col-lg-9 input#index.form-control(type='text', name='mapping', placeholder='default') .form-group - label.col-lg-2.control-label(for='offset') Title-row - .col-lg-10 + label.col-lg-3.control-label(for='offset') Title-row (excel) + .col-lg-9 input#offset.form-control(type='text', name='offset', value='1') .form-group label.col-lg-2.control-label(for='clear') diff --git a/src/test/java/TestParser.java b/src/test/java/TestParser.java deleted file mode 100644 index f2654b0..0000000 --- a/src/test/java/TestParser.java +++ /dev/null @@ -1,69 +0,0 @@ -import com.codingchili.Model.FileParser; -import com.codingchili.Model.ParserException; -import io.vertx.core.json.JsonArray; -import io.vertx.core.json.JsonObject; -import io.vertx.ext.unit.TestContext; -import io.vertx.ext.unit.junit.VertxUnitRunner; -import org.junit.Test; -import org.junit.runner.RunWith; - -import java.io.*; -import java.nio.file.Files; -import java.nio.file.Paths; - -/** - * @author Robin Duda - */ -@RunWith(VertxUnitRunner.class) -public class TestParser { - public static final String TEST_XLSX_FILE = "src/test/java/test.xlsx"; - public static final String TEST_XLS_FILE = "src/test/java/test.xls"; - public static final String TEST_INVALID_FILE = "src/test/java/invalid.xlsx"; - public static final int ROW_OFFSET = 5; - private static final String XLSX = ".xlsx"; - - @Test - public void failParseInvalid() throws Exception { - try { - new FileParser(new File(TEST_INVALID_FILE), 5, XLSX); - throw new Exception("Should fail for invalid bytes."); - } catch (ParserException ignored) { - } - } - - @Test - public void testParseOOXML(TestContext context) throws IOException, ParserException { - testParseFile(context, TEST_XLSX_FILE); - } - - @Test - public void testParse2007(TestContext context) throws IOException, ParserException { - testParseFile(context, TEST_XLS_FILE); - } - - private void testParseFile(TestContext context, String fileName) throws IOException, ParserException { - FileParser parser = new FileParser( - Paths.get(fileName).toFile(), - ROW_OFFSET, - fileName - ); - - parser.assertFileParsable(); - - JsonArray list = new JsonArray(); - parser.parseRowRange(0, parser.getNumberOfElements(), list::add); - - context.assertEquals(2, list.size()); - - for (int i = 0; i < list.size(); i++) { - JsonObject json = list.getJsonObject(i); - context.assertTrue(json.containsKey("Column 1")); - context.assertTrue(json.containsKey("Column 2")); - context.assertTrue(json.containsKey("Column 3")); - - context.assertEquals("cell " + (ROW_OFFSET + 1 + i) + "." + 1, json.getString("Column 1")); - context.assertEquals("cell " + (ROW_OFFSET + 1 + i) + "." + 2, json.getString("Column 2")); - context.assertEquals("cell " + (ROW_OFFSET + 1 + i) + "." + 3, json.getString("Column 3")); - } - } -} diff --git a/src/test/java/TestConfiguration.java b/src/test/java/com/codingchili/TestConfiguration.java similarity index 95% rename from src/test/java/TestConfiguration.java rename to src/test/java/com/codingchili/TestConfiguration.java index 3be557c..88a411d 100644 --- a/src/test/java/TestConfiguration.java +++ b/src/test/java/com/codingchili/TestConfiguration.java @@ -1,21 +1,23 @@ -import com.codingchili.Model.Configuration; -import io.vertx.ext.unit.TestContext; -import io.vertx.ext.unit.junit.VertxUnitRunner; -import org.junit.Test; -import org.junit.runner.RunWith; - -/** - * @author Robin Duda - */ - -@RunWith(VertxUnitRunner.class) -public class TestConfiguration { - - @Test - public void shouldLoadConfiguration(TestContext context) { - context.assertNotNull(Configuration.getWebPort()); - context.assertNotNull(Configuration.getElasticPort()); - context.assertNotNull(Configuration.getElasticHost()); - } - -} +package com.codingchili; + +import com.codingchili.Model.Configuration; +import io.vertx.ext.unit.TestContext; +import io.vertx.ext.unit.junit.VertxUnitRunner; +import org.junit.Test; +import org.junit.runner.RunWith; + +/** + * @author Robin Duda + */ + +@RunWith(VertxUnitRunner.class) +public class TestConfiguration { + + @Test + public void shouldLoadConfiguration(TestContext context) { + context.assertNotNull(Configuration.getWebPort()); + context.assertNotNull(Configuration.getElasticPort()); + context.assertNotNull(Configuration.getElasticHost()); + } + +} diff --git a/src/test/java/com/codingchili/TestParser.java b/src/test/java/com/codingchili/TestParser.java new file mode 100644 index 0000000..8592847 --- /dev/null +++ b/src/test/java/com/codingchili/TestParser.java @@ -0,0 +1,111 @@ +package com.codingchili; + +import com.codingchili.Model.*; +import io.vertx.core.json.JsonArray; +import io.vertx.core.json.JsonObject; +import io.vertx.ext.unit.TestContext; +import io.vertx.ext.unit.junit.VertxUnitRunner; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.reactivestreams.Subscriber; +import org.reactivestreams.Subscription; + +import java.io.IOException; + +/** + * @author Robin Duda + */ +@RunWith(VertxUnitRunner.class) +public class TestParser { + private static final String TEST_XLSX_FILE = "/test.xlsx"; + static final String TEST_XLS_FILE = "/test.xls"; + private static final String TEST_INVALID_FILE = "/invalid.xlsx"; + static final int ROW_OFFSET = 5; + private static final String XLSX = ".xlsx"; + private static final String TEST_CSV = "/test.csv"; + + @Test + public void failParseInvalid() throws Exception { + try { + new ExcelParser().setFileData(toPath(TEST_INVALID_FILE), 5, XLSX); + throw new Exception("Should fail for invalid bytes."); + } catch (ParserException ignored) { + } + } + + @Test(expected = InvalidFileNameException.class) + public void testParseMissingExt() { + ParserFactory.getByFilename("file"); + } + + @Test(expected = UnsupportedFileTypeException.class) + public void testParseMissingParser() { + ParserFactory.getByFilename("file.xxx"); + } + + @Test + public void testParseOOXML(TestContext context) throws IOException { + testParseFile(context, TEST_XLSX_FILE); + } + + @Test + public void testParse2007(TestContext context) throws IOException { + testParseFile(context, TEST_XLS_FILE); + } + + @Test + public void testParseCSV(TestContext context) throws IOException { + testParseFile(context, TEST_CSV); + } + + private void testParseFile(TestContext context, String fileName) throws IOException, ParserException { + FileParser parser = ParserFactory.getByFilename(fileName); + parser.setFileData( + toPath(fileName), + ROW_OFFSET, + fileName + ); + + parser.initialize(); + + parser.subscribe(new Subscriber() { + JsonArray list = new JsonArray(); + + + @Override + public void onSubscribe(Subscription subscription) { + subscription.request(3); + } + + @Override + public void onNext(JsonObject entry) { + list.add(entry); + } + + @Override + public void onError(Throwable throwable) { + + } + + @Override + public void onComplete() { + context.assertEquals(2, list.size()); + + for (int i = 0; i < list.size(); i++) { + JsonObject json = list.getJsonObject(i); + context.assertTrue(json.containsKey("Column 1")); + context.assertTrue(json.containsKey("Column 2")); + context.assertTrue(json.containsKey("Column 3")); + + context.assertEquals("cell " + (ROW_OFFSET + 1 + i) + "." + 1, json.getString("Column 1")); + context.assertEquals("cell " + (ROW_OFFSET + 1 + i) + "." + 2, json.getString("Column 2")); + context.assertEquals("cell " + (ROW_OFFSET + 1 + i) + "." + 3, json.getString("Column 3")); + } + } + }); + } + + private static String toPath(String resource) { + return TestParser.class.getResource(resource).getPath(); + } +} diff --git a/src/test/java/TestWebsite.java b/src/test/java/com/codingchili/TestWebsite.java similarity index 96% rename from src/test/java/TestWebsite.java rename to src/test/java/com/codingchili/TestWebsite.java index 8886904..e5bc3ca 100644 --- a/src/test/java/TestWebsite.java +++ b/src/test/java/com/codingchili/TestWebsite.java @@ -1,81 +1,83 @@ -import com.codingchili.Controller.Website; -import com.codingchili.Model.Configuration; -import io.vertx.core.Vertx; -import io.vertx.core.json.JsonObject; -import io.vertx.ext.unit.Async; -import io.vertx.ext.unit.TestContext; -import io.vertx.ext.unit.junit.Timeout; -import io.vertx.ext.unit.junit.VertxUnitRunner; -import org.junit.*; -import org.junit.runner.RunWith; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Paths; - -/** - * @author Robin Duda - */ -@RunWith(VertxUnitRunner.class) -public class TestWebsite { - private Vertx vertx; - - @Before - public void setUp(TestContext context) { - vertx = Vertx.vertx(); - vertx.deployVerticle(new Website(), context.asyncAssertSuccess()); - } - - @After - public void tearDown(TestContext context) { - vertx.close(context.asyncAssertSuccess()); - } - - @Rule - public Timeout timeout = Timeout.seconds(5); - - @Test - public void shouldGetStartPage(TestContext context) { - Async async = context.async(); - - vertx.createHttpClient().getNow(Configuration.getWebPort(), "localhost", "/", response -> { - context.assertEquals(200, response.statusCode()); - async.complete(); - }); - } - - @Ignore("The file must be recognized as a file on the server side, test broken.") - public void shouldSucceedUpload(TestContext context) throws IOException { - Async async = context.async(); - - vertx.createHttpClient().post(Configuration.getWebPort(), "localhost", "/api/upload", response -> { - response.bodyHandler(body -> { - context.assertTrue(body.toString().contains("Done")); - context.assertEquals(200, response.statusCode()); - async.complete(); - }); - }).putHeader("content-type", "multipart/form-data").end(new JsonObject() - .put("index", "test") - .put("offset", 5) - .put("file", getFileBytes()) - .encode()); - } - - private byte[] getFileBytes() throws IOException { - return Files.readAllBytes(Paths.get("src/test/java/test.xlsx")); - } - - @Test - public void shouldFailUpload(TestContext context) { - Async async = context.async(); - - vertx.createHttpClient().post(Configuration.getWebPort(), "localhost", "/api/upload", response -> { - response.bodyHandler(body -> { - context.assertTrue(body.toString().contains("error")); - context.assertEquals(200, response.statusCode()); - async.complete(); - }); - }).end(); - } - -} +package com.codingchili; + +import com.codingchili.Controller.Website; +import com.codingchili.Model.Configuration; +import io.vertx.core.Vertx; +import io.vertx.core.json.JsonObject; +import io.vertx.ext.unit.Async; +import io.vertx.ext.unit.TestContext; +import io.vertx.ext.unit.junit.Timeout; +import io.vertx.ext.unit.junit.VertxUnitRunner; +import org.junit.*; +import org.junit.runner.RunWith; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; + +/** + * @author Robin Duda + */ +@RunWith(VertxUnitRunner.class) +public class TestWebsite { + private Vertx vertx; + + @Before + public void setUp(TestContext context) { + vertx = Vertx.vertx(); + vertx.deployVerticle(new Website(), context.asyncAssertSuccess()); + } + + @After + public void tearDown(TestContext context) { + vertx.close(context.asyncAssertSuccess()); + } + + @Rule + public Timeout timeout = Timeout.seconds(5); + + @Test + public void shouldGetStartPage(TestContext context) { + Async async = context.async(); + + vertx.createHttpClient().getNow(Configuration.getWebPort(), "localhost", "/", response -> { + context.assertEquals(200, response.statusCode()); + async.complete(); + }); + } + + @Ignore("The file must be recognized as a file on the server side, test broken.") + public void shouldSucceedUpload(TestContext context) throws IOException { + Async async = context.async(); + + vertx.createHttpClient().post(Configuration.getWebPort(), "localhost", "/api/upload", response -> { + response.bodyHandler(body -> { + context.assertTrue(body.toString().contains("Done")); + context.assertEquals(200, response.statusCode()); + async.complete(); + }); + }).putHeader("content-type", "multipart/form-data").end(new JsonObject() + .put("index", "test") + .put("offset", 5) + .put("file", getFileBytes()) + .encode()); + } + + private byte[] getFileBytes() throws IOException { + return Files.readAllBytes(Paths.get("src/test/java/test.xlsx")); + } + + @Test + public void shouldFailUpload(TestContext context) { + Async async = context.async(); + + vertx.createHttpClient().post(Configuration.getWebPort(), "localhost", "/api/upload", response -> { + response.bodyHandler(body -> { + context.assertTrue(body.toString().contains("error")); + context.assertEquals(200, response.statusCode()); + async.complete(); + }); + }).end(); + } + +} diff --git a/src/test/java/TestWriter.java b/src/test/java/com/codingchili/TestWriter.java similarity index 83% rename from src/test/java/TestWriter.java rename to src/test/java/com/codingchili/TestWriter.java index dd75d03..065b5cf 100644 --- a/src/test/java/TestWriter.java +++ b/src/test/java/com/codingchili/TestWriter.java @@ -1,66 +1,64 @@ -import com.codingchili.Model.*; -import io.vertx.core.Vertx; -import io.vertx.core.json.JsonArray; -import io.vertx.core.json.JsonObject; -import io.vertx.ext.unit.Async; -import io.vertx.ext.unit.TestContext; -import io.vertx.ext.unit.junit.Timeout; -import io.vertx.ext.unit.junit.VertxUnitRunner; -import org.junit.After; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.runner.RunWith; - -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Paths; - -/** - * @author Robin Duda - */ -@RunWith(VertxUnitRunner.class) -public class TestWriter { - private Vertx vertx; - - @Before - public void setUp(TestContext context) { - vertx = Vertx.vertx(); - ImportEventCodec.registerOn(vertx); - vertx.deployVerticle(new ElasticWriter(), context.asyncAssertSuccess()); - } - - @Rule - public Timeout timeout = Timeout.seconds(5); - - @After - public void tearDown(TestContext context) { - vertx.close(context.asyncAssertSuccess()); - } - - @Test - public void shouldWriteToElasticPort(TestContext context) throws IOException { - Async async = context.async(); - - vertx.createHttpServer().requestHandler(request -> { - - request.bodyHandler(body -> { - context.assertTrue(body.toString() != null); - async.complete(); - }); - }).listen(Configuration.getElasticPort()); - - FileParser fileParser = new FileParser( - Paths.get(TestParser.TEST_XLS_FILE).toFile(), - TestParser.ROW_OFFSET, - "testFileName.xls"); - - vertx.eventBus().send(Configuration.INDEXING_ELASTICSEARCH, new ImportEvent() - .setParser(fileParser) - .setIndex("text-index") - .setClearExisting(false) - .setMapping("test-mapping")); - } - -} +package com.codingchili; + +import com.codingchili.Model.*; +import io.vertx.core.Vertx; +import io.vertx.ext.unit.Async; +import io.vertx.ext.unit.TestContext; +import io.vertx.ext.unit.junit.Timeout; +import io.vertx.ext.unit.junit.VertxUnitRunner; +import org.junit.After; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.runner.RunWith; + +import java.io.IOException; +import java.nio.file.Paths; + +/** + * @author Robin Duda + */ +@RunWith(VertxUnitRunner.class) +public class TestWriter { + private Vertx vertx; + + @Before + public void setUp(TestContext context) { + vertx = Vertx.vertx(); + ImportEventCodec.registerOn(vertx); + vertx.deployVerticle(new ElasticWriter(), context.asyncAssertSuccess()); + } + + @Rule + public Timeout timeout = Timeout.seconds(5); + + @After + public void tearDown(TestContext context) { + vertx.close(context.asyncAssertSuccess()); + } + + @Test + public void shouldWriteToElasticPort(TestContext context) throws IOException { + Async async = context.async(); + + vertx.createHttpServer().requestHandler(request -> { + + request.bodyHandler(body -> { + context.assertTrue(body.toString() != null); + async.complete(); + }); + }).listen(Configuration.getElasticPort()); + + ExcelParser fileParser = new ExcelParser(); + fileParser.setFileData(getClass().getResource(TestParser.TEST_XLS_FILE).getPath(), + TestParser.ROW_OFFSET, + "testFileName.xls"); + + vertx.eventBus().send(Configuration.INDEXING_ELASTICSEARCH, new ImportEvent() + .setParser(fileParser) + .setIndex("text-index") + .setClearExisting(false) + .setMapping("test-mapping")); + } + +} diff --git a/src/test/java/invalid.xlsx b/src/test/resources/invalid.xlsx similarity index 100% rename from src/test/java/invalid.xlsx rename to src/test/resources/invalid.xlsx diff --git a/src/test/resources/test.csv b/src/test/resources/test.csv new file mode 100644 index 0000000..6306f62 --- /dev/null +++ b/src/test/resources/test.csv @@ -0,0 +1,3 @@ +Column 1, Column 2, Column 3 +cell 6.1, cell 6.2, cell 6.3 +cell 7.1, cell 7.2, cell 7.3 \ No newline at end of file diff --git a/src/test/java/test.xls b/src/test/resources/test.xls similarity index 100% rename from src/test/java/test.xls rename to src/test/resources/test.xls diff --git a/src/test/java/test.xlsx b/src/test/resources/test.xlsx similarity index 100% rename from src/test/java/test.xlsx rename to src/test/resources/test.xlsx