Skip to content

Commit

Permalink
Add support for parsing CSV files.
Browse files Browse the repository at this point in the history
  • Loading branch information
Robin Duda committed Oct 28, 2018
1 parent e30e468 commit 70343a4
Show file tree
Hide file tree
Showing 25 changed files with 913 additions and 493 deletions.
9 changes: 3 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,13 @@ Tested with ElasticSearch 5.6.2 and 6.2.3.

Running the application, filename and index is required, to import from the terminal run:
```
java -Xmx2g -jar excelastic-1.2.7.jar <fileName> <indexName> --mapping mappingName --clear
java -Xmx2g -jar excelastic-1.3.0.jar <fileName> <indexName> --mapping mappingName --clear
```
If running with --clear, then the existing index will be cleared before the import starts.

To run with the web interface, run the following in your terminal:
```
java -Xmx2g -jar excelastic-1.2.7.jar
java -Xmx2g -jar excelastic-1.3.0.jar
```
When the application successfully connects to the ElasticSearch server, the browser will automatically open a new tab.

Expand Down Expand Up @@ -67,10 +67,7 @@ If no configuration file is present a new configuration file will be created usi

## Contributing

If you want to contribute to this project, open an issue or pull request.

In the 1.2.7 release we have cleaned up the code and added even more javadoc
in order to promote contributions! :astonished:
If you want to contribute to this project, open an issue or pull request. :heart_eyes_cat: :metal:

---

Expand Down
Binary file modified excelastic.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

<groupId>com.codingchili</groupId>
<artifactId>excelastic</artifactId>
<version>1.2.7</version>
<version>1.3.0</version>
<build>
<plugins>
<plugin>
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/com/codingchili/ApplicationLauncher.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
*/
public class ApplicationLauncher {
private final ApplicationLogger logger = new ApplicationLogger(getClass());
public static String VERSION = "1.2.7";
public static String VERSION = "1.3.0";
private Vertx vertx;

public static void main(String[] args) {
Expand Down
6 changes: 4 additions & 2 deletions src/main/java/com/codingchili/Controller/CommandLine.java
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,11 @@ private void importFile(ImportEvent event, String fileName) {
logger.loadingFromFilesystem(fileName);
logger.parsingStarted();
try {
FileParser parser = new FileParser(new File(fileName), 1, fileName);
FileParser parser = ParserFactory.getByFilename(fileName);
parser.setFileData(fileName, 1, fileName);

event.setParser(parser);
parser.assertFileParsable();
parser.initialize();

logger.importStarted(event.getIndex());
vertx.eventBus().send(Configuration.INDEXING_ELASTICSEARCH, event, getDeliveryOpts(),
Expand Down
8 changes: 5 additions & 3 deletions src/main/java/com/codingchili/Controller/Website.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import static com.codingchili.ApplicationLauncher.VERSION;
import static com.codingchili.Model.Configuration.INDEXING_ELASTICSEARCH;
import static com.codingchili.Model.ElasticWriter.*;
import static com.codingchili.Model.FileParser.INDEX;
import static com.codingchili.Model.ExcelParser.INDEX;

/**
* @author Robin Duda
Expand Down Expand Up @@ -172,8 +172,10 @@ private void parse(String uploadedFileName, MultiMap params, String fileName, Fu
vertx.executeBlocking(blocking -> {
try {
ImportEvent event = ImportEvent.fromParams(params);
FileParser parser = new FileParser(new File(uploadedFileName), event.getOffset(), fileName);
parser.assertFileParsable();
FileParser parser = ParserFactory.getByFilename(fileName);
parser.setFileData(uploadedFileName, event.getOffset(), fileName);

parser.initialize();
event.setParser(parser);

// submit an import event.
Expand Down
220 changes: 220 additions & 0 deletions src/main/java/com/codingchili/Model/CSVParser.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
package com.codingchili.Model;

import io.vertx.core.json.JsonObject;
import org.reactivestreams.Subscriber;
import org.reactivestreams.Subscription;

import java.io.*;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * @author Robin Duda
 * <p>
 * Parses CSV files into one JsonObject per row, using the first (header) row
 * as the field names. The file is memory-mapped and scanned byte by byte.
 */
public class CSVParser implements FileParser {
    private static final int MAX_LINE_LENGTH = 16384;
    private static final int PAGE_16MB = 16777216;

    private static final char TOKEN_NULL = '\0';
    private static final char TOKEN_CR = '\r';
    private static final char TOKEN_LF = '\n';
    private static final char TOKEN_QUOTE = '\"';
    private static final char TOKEN_SEPARATOR = ',';

    // scratch buffer accumulating the bytes of the column currently being read.
    private ByteBuffer buffer = ByteBuffer.allocate(MAX_LINE_LENGTH);
    // field names taken from the header row; values are placeholders.
    private JsonObject headers = new JsonObject();
    // iterator over header names, reset at the start of each row.
    private Iterator<String> header;
    private RandomAccessFile file;
    private MappedByteBuffer map;
    private long fileSize;
    private int index = 0; // current row number, used in error messages.
    private int rows = 0;  // number of data rows (newline count) in the file.

    /**
     * Opens and memory-maps the given file, then reads the row count and headers.
     *
     * NOTE: the file is intentionally opened "rw" even though the mapping is
     * READ_ONLY: mapping PAGE_16MB on a writable channel zero-extends files
     * smaller than 16MB, and that zero padding is what makes TOKEN_NULL work
     * as the end-of-data sentinel in readRow(). Files larger than 16MB are
     * silently truncated to the first 16MB — TODO confirm this limit is intended.
     *
     * @param localFileName path to the CSV file on disk.
     * @param offset        unused by the CSV parser.
     * @param fileName      the original (display) file name, unused here.
     * @throws FileNotFoundException when the file cannot be opened.
     */
    @Override
    public void setFileData(String localFileName, int offset, String fileName) throws FileNotFoundException {
        file = new RandomAccessFile(localFileName, "rw");
        try {
            map = file.getChannel().map(FileChannel.MapMode.READ_ONLY, 0, PAGE_16MB);
            fileSize = file.length();
            readRowCount();
            readHeaders();
        } catch (IOException e) {
            throw new ParserException(e);
        }
    }

    @Override
    public Set<String> getSupportedFileExtensions() {
        return new HashSet<>(Collections.singletonList(".csv"));
    }

    /**
     * Performs a full validation pass: parses every row once so that malformed
     * rows fail here, before any data is submitted for import.
     */
    @Override
    public void initialize() {
        index = 0;
        map.position(0);
        readRow(); // skip headers row.
        for (int i = 0; i < rows; i++) {
            readRow();
        }
    }

    /**
     * Counts data rows by counting newline bytes in the mapped region.
     *
     * @return the number of rows (excluding the header row, since its own
     *         newline compensates for a missing trailing newline — assumes the
     *         last line is newline-terminated OR the header line ends in one).
     */
    private int readRowCount() {
        for (int i = map.position(); i < fileSize; i++) {
            if (map.get(i) == '\n') {
                rows++;
            }
        }
        return rows;
    }

    /**
     * Reads the first line of the file and registers each comma-separated,
     * quote-stripped, trimmed name as a header field.
     *
     * NOTE: decodes with the platform default charset (new String(byte[]));
     * trailing NUL bytes from the oversized scratch array are removed by trim().
     */
    private void readHeaders() throws IOException {
        map.position(0);

        for (int i = map.position(); i < file.length(); i++) {
            if (map.get(i) == '\n') {
                Arrays.stream(new String(buffer.array()).split(","))
                        .map(header -> header.replaceAll("\"", ""))
                        .map(String::trim).forEach(header -> {
                    headers.put(header, "<empty>");
                });
                break;
            } else {
                buffer.put(map.get(i));
            }
        }
        buffer.clear();
    }

    /**
     * Completes the current column: copies the scratch buffer into the row
     * object under the next header name and resets the buffer.
     *
     * @param columnsRead running count of columns completed on this row.
     * @param buffer      scratch buffer holding the column's bytes.
     * @param json        the row object being built.
     * @throws ColumnsExceededHeadersException when the row has more values than headers.
     */
    private void process(AtomicInteger columnsRead, ByteBuffer buffer, JsonObject json) {
        columnsRead.incrementAndGet();

        if (columnsRead.get() > headers.size()) {
            throw new ColumnsExceededHeadersException(columnsRead.get(), headers.size(), index + 1);
        } else {
            int read = buffer.position();
            byte[] line = new byte[read + 1];

            buffer.position(0);
            buffer.get(line, 0, read);
            line[line.length - 1] = '\0'; // NUL-terminate; parseDatatype trims it.

            json.put(header.next(), parseDatatype(line));
            buffer.clear();
        }
    }

    /**
     * Reads one row from the current map position into a JsonObject keyed by
     * the header names. Relies on the zero padding of the 16MB mapping: a NUL
     * byte marks end-of-data.
     *
     * @return the parsed row.
     * @throws ParserException when the number of values does not match the headers.
     */
    private JsonObject readRow() {
        // reset current header.
        header = headers.fieldNames().iterator();

        AtomicInteger columnsRead = new AtomicInteger(0);
        JsonObject json = headers.copy();
        boolean quoted = false;
        boolean done = false;

        while (!done) {
            byte current = map.get();

            switch (current) {
                case TOKEN_NULL:
                    // EOF call process.
                    process(columnsRead, buffer, json);
                    done = true;
                    break;
                case TOKEN_CR:
                case TOKEN_LF:
                    // final header is being read and EOL appears.
                    // NOTE(review): for a single-column file a leading LF of a
                    // CRLF pair satisfies this condition at the start of the
                    // next row — TODO confirm single-column CRLF input.
                    if (columnsRead.get() == headers.size() - 1) {
                        process(columnsRead, buffer, json);
                        done = true;
                        break;
                    } else {
                        // skip token if not all headers read.
                        continue;
                    }
                case TOKEN_QUOTE:
                    // toggle quoted to support commas within quotes.
                    quoted = !quoted;
                    break;
                case TOKEN_SEPARATOR:
                    if (!quoted) {
                        process(columnsRead, buffer, json);
                        break;
                    }
                default:
                    // store the current token in the buffer until the column ends.
                    buffer.put(current);
            }
        }

        if (!(columnsRead.get() == headers.size())) {
            throw new ParserException(
                    String.format("Error at line %d, values (%d) does not match headers (%d).",
                            index, columnsRead.get(), headers.size()));
        } else {
            index++;
        }

        // parse json object.
        return json;
    }

    /**
     * Converts a raw column value to the most specific of Integer, Boolean or
     * String.
     *
     * @param data the column bytes, NUL-terminated; decoded with the platform
     *             default charset and trimmed (which also drops the NUL).
     * @return an Integer for digit-only values that fit in an int, a Boolean
     *         for "true"/"false", otherwise the trimmed String.
     */
    private Object parseDatatype(byte[] data) {
        String line = new String(data).trim();

        // "[0-9]+" rather than "[0-9]*": the empty string matched "[0-9]*",
        // making Integer.parseInt("") throw on any empty column.
        if (line.matches("[0-9]+")) {
            try {
                return Integer.parseInt(line);
            } catch (NumberFormatException e) {
                // digit run too long for an int: keep the textual value.
                return line;
            }
        } else if (line.matches("true|false")) {
            return Boolean.parseBoolean(line);
        } else {
            return line;
        }
    }

    @Override
    public int getNumberOfElements() {
        return rows;
    }

    /**
     * Streams all data rows to the subscriber. Rewinds the mapping, skips the
     * header row, then emits up to {@code rows} rows across request() calls.
     *
     * @param subscriber receives one JsonObject per row, then onComplete.
     */
    @Override
    public void subscribe(Subscriber<? super JsonObject> subscriber) {
        map.position(0);
        readRow();
        index = 0;

        subscriber.onSubscribe(new Subscription() {
            private boolean complete = false;
            // rows emitted so far; shared across request() calls so repeated
            // requests never read past the end of the data (the original
            // restarted from 0 each call and could double-read / over-read).
            private int emitted = 0;

            @Override
            public void request(long count) {
                for (int i = 0; i < count && emitted < rows && !complete; i++) {
                    JsonObject result = readRow();
                    emitted++;

                    if (result != null) {
                        subscriber.onNext(result);
                    } else {
                        complete = true;
                        subscriber.onComplete();
                    }
                }

                if (emitted >= rows && !complete) {
                    // set the flag so onComplete fires exactly once
                    // (Reactive Streams rule 1.7).
                    complete = true;
                    subscriber.onComplete();
                }
            }

            @Override
            public void cancel() {
                // send no more items!
                complete = true;
            }
        });
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package com.codingchili.Model;

/**
 * @author Robin Duda
 *
 * Thrown when a row contains more values than the header row declares.
 */
public class ColumnsExceededHeadersException extends ParserException {

    /**
     * @param values  number of values encountered on the offending row.
     * @param headers the number of headers on the first row.
     * @param index   the line in the file.
     */
    public ColumnsExceededHeadersException(int values, int headers, int index) {
        super("Encountered too many values (" + values + ") on row " + index
                + ", expected to match headers (" + headers + ").");
    }
}
Loading

0 comments on commit 70343a4

Please sign in to comment.