Skip to content

Commit

Permalink
Add support for parsing CSV files.
Browse files Browse the repository at this point in the history
  • Loading branch information
Robin Duda committed Oct 28, 2018
1 parent e30e468 commit 70343a4
Show file tree
Hide file tree
Showing 25 changed files with 913 additions and 493 deletions.
9 changes: 3 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,13 @@ Tested with ElasticSearch 5.6.2 and 6.2.3.

Running the application, filename and index is required, to import from the terminal run:
```
java -Xmx2g -jar excelastic-1.2.7.jar <fileName> <indexName> --mapping mappingName --clear
java -Xmx2g -jar excelastic-1.3.0.jar <fileName> <indexName> --mapping mappingName --clear
```
If running with --clear, then the existing index will be cleared before the import starts.

To run with the web interface, run the following in your terminal:
```
java -Xmx2g -jar excelastic-1.2.7.jar
java -Xmx2g -jar excelastic-1.3.0.jar
```
When the application successfully connects to the ElasticSearch server, the browser will automatically open a new tab.

Expand Down Expand Up @@ -67,10 +67,7 @@ If no configuration file is present a new configuration file will be created usi

## Contributing

If you want to contribute to this project, open an issue or pull request.

In the 1.2.7 release we have cleaned up the code and added even more javadoc
in order to promote contributions! :astonished:
If you want to contribute to this project, open an issue or pull request. :heart_eyes_cat: :metal:

---

Expand Down
Binary file modified excelastic.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

<groupId>com.codingchili</groupId>
<artifactId>excelastic</artifactId>
<version>1.2.7</version>
<version>1.3.0</version>
<build>
<plugins>
<plugin>
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/com/codingchili/ApplicationLauncher.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
*/
public class ApplicationLauncher {
private final ApplicationLogger logger = new ApplicationLogger(getClass());
public static String VERSION = "1.2.7";
public static String VERSION = "1.3.0";
private Vertx vertx;

public static void main(String[] args) {
Expand Down
6 changes: 4 additions & 2 deletions src/main/java/com/codingchili/Controller/CommandLine.java
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,11 @@ private void importFile(ImportEvent event, String fileName) {
logger.loadingFromFilesystem(fileName);
logger.parsingStarted();
try {
FileParser parser = new FileParser(new File(fileName), 1, fileName);
FileParser parser = ParserFactory.getByFilename(fileName);
parser.setFileData(fileName, 1, fileName);

event.setParser(parser);
parser.assertFileParsable();
parser.initialize();

logger.importStarted(event.getIndex());
vertx.eventBus().send(Configuration.INDEXING_ELASTICSEARCH, event, getDeliveryOpts(),
Expand Down
8 changes: 5 additions & 3 deletions src/main/java/com/codingchili/Controller/Website.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import static com.codingchili.ApplicationLauncher.VERSION;
import static com.codingchili.Model.Configuration.INDEXING_ELASTICSEARCH;
import static com.codingchili.Model.ElasticWriter.*;
import static com.codingchili.Model.FileParser.INDEX;
import static com.codingchili.Model.ExcelParser.INDEX;

/**
* @author Robin Duda
Expand Down Expand Up @@ -172,8 +172,10 @@ private void parse(String uploadedFileName, MultiMap params, String fileName, Fu
vertx.executeBlocking(blocking -> {
try {
ImportEvent event = ImportEvent.fromParams(params);
FileParser parser = new FileParser(new File(uploadedFileName), event.getOffset(), fileName);
parser.assertFileParsable();
FileParser parser = ParserFactory.getByFilename(fileName);
parser.setFileData(uploadedFileName, event.getOffset(), fileName);

parser.initialize();
event.setParser(parser);

// submit an import event.
Expand Down
220 changes: 220 additions & 0 deletions src/main/java/com/codingchili/Model/CSVParser.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
package com.codingchili.Model;

import io.vertx.core.json.JsonObject;
import org.reactivestreams.Subscriber;
import org.reactivestreams.Subscription;

import java.io.*;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * @author Robin Duda
 * <p>
 * Parses CSV files into one JsonObject per row, using the first (header) row
 * as the field names. The file is memory-mapped and scanned byte by byte.
 */
public class CSVParser implements FileParser {
    private static final int MAX_LINE_LENGTH = 16384;
    private static final int PAGE_16MB = 16777216;

    private static final char TOKEN_NULL = '\0';
    private static final char TOKEN_CR = '\r';
    private static final char TOKEN_LF = '\n';
    private static final char TOKEN_QUOTE = '\"';
    private static final char TOKEN_SEPARATOR = ',';

    // scratch buffer accumulating the bytes of the column currently being read.
    private ByteBuffer buffer = ByteBuffer.allocate(MAX_LINE_LENGTH);
    // field names taken from the header row; values are placeholders.
    private JsonObject headers = new JsonObject();
    // iterator over header names, reset at the start of each row.
    private Iterator<String> header;
    private RandomAccessFile file;
    private MappedByteBuffer map;
    private long fileSize;
    private int index = 0; // current row number, used in error messages.
    private int rows = 0;  // number of data rows (newline count) in the file.

    /**
     * Opens and memory-maps the given file, then reads the row count and headers.
     *
     * NOTE: the file is intentionally opened "rw" even though the mapping is
     * READ_ONLY: mapping PAGE_16MB on a writable channel zero-extends files
     * smaller than 16MB, and that zero padding is what makes TOKEN_NULL work
     * as the end-of-data sentinel in readRow(). Files larger than 16MB are
     * silently truncated to the first 16MB — TODO confirm this limit is intended.
     *
     * @param localFileName path to the CSV file on disk.
     * @param offset        unused by the CSV parser.
     * @param fileName      the original (display) file name, unused here.
     * @throws FileNotFoundException when the file cannot be opened.
     */
    @Override
    public void setFileData(String localFileName, int offset, String fileName) throws FileNotFoundException {
        file = new RandomAccessFile(localFileName, "rw");
        try {
            map = file.getChannel().map(FileChannel.MapMode.READ_ONLY, 0, PAGE_16MB);
            fileSize = file.length();
            readRowCount();
            readHeaders();
        } catch (IOException e) {
            throw new ParserException(e);
        }
    }

    @Override
    public Set<String> getSupportedFileExtensions() {
        return new HashSet<>(Collections.singletonList(".csv"));
    }

    /**
     * Performs a full validation pass: parses every row once so that malformed
     * rows fail here, before any data is submitted for import.
     */
    @Override
    public void initialize() {
        index = 0;
        map.position(0);
        readRow(); // skip headers row.
        for (int i = 0; i < rows; i++) {
            readRow();
        }
    }

    /**
     * Counts data rows by counting newline bytes in the mapped region.
     *
     * @return the number of rows (excluding the header row, since its own
     *         newline compensates for a missing trailing newline — assumes the
     *         last line is newline-terminated OR the header line ends in one).
     */
    private int readRowCount() {
        for (int i = map.position(); i < fileSize; i++) {
            if (map.get(i) == '\n') {
                rows++;
            }
        }
        return rows;
    }

    /**
     * Reads the first line of the file and registers each comma-separated,
     * quote-stripped, trimmed name as a header field.
     *
     * NOTE: decodes with the platform default charset (new String(byte[]));
     * trailing NUL bytes from the oversized scratch array are removed by trim().
     */
    private void readHeaders() throws IOException {
        map.position(0);

        for (int i = map.position(); i < file.length(); i++) {
            if (map.get(i) == '\n') {
                Arrays.stream(new String(buffer.array()).split(","))
                        .map(header -> header.replaceAll("\"", ""))
                        .map(String::trim).forEach(header -> {
                    headers.put(header, "<empty>");
                });
                break;
            } else {
                buffer.put(map.get(i));
            }
        }
        buffer.clear();
    }

    /**
     * Completes the current column: copies the scratch buffer into the row
     * object under the next header name and resets the buffer.
     *
     * @param columnsRead running count of columns completed on this row.
     * @param buffer      scratch buffer holding the column's bytes.
     * @param json        the row object being built.
     * @throws ColumnsExceededHeadersException when the row has more values than headers.
     */
    private void process(AtomicInteger columnsRead, ByteBuffer buffer, JsonObject json) {
        columnsRead.incrementAndGet();

        if (columnsRead.get() > headers.size()) {
            throw new ColumnsExceededHeadersException(columnsRead.get(), headers.size(), index + 1);
        } else {
            int read = buffer.position();
            byte[] line = new byte[read + 1];

            buffer.position(0);
            buffer.get(line, 0, read);
            line[line.length - 1] = '\0'; // NUL-terminate; parseDatatype trims it.

            json.put(header.next(), parseDatatype(line));
            buffer.clear();
        }
    }

    /**
     * Reads one row from the current map position into a JsonObject keyed by
     * the header names. Relies on the zero padding of the 16MB mapping: a NUL
     * byte marks end-of-data.
     *
     * @return the parsed row.
     * @throws ParserException when the number of values does not match the headers.
     */
    private JsonObject readRow() {
        // reset current header.
        header = headers.fieldNames().iterator();

        AtomicInteger columnsRead = new AtomicInteger(0);
        JsonObject json = headers.copy();
        boolean quoted = false;
        boolean done = false;

        while (!done) {
            byte current = map.get();

            switch (current) {
                case TOKEN_NULL:
                    // EOF call process.
                    process(columnsRead, buffer, json);
                    done = true;
                    break;
                case TOKEN_CR:
                case TOKEN_LF:
                    // final header is being read and EOL appears.
                    // NOTE(review): for a single-column file a leading LF of a
                    // CRLF pair satisfies this condition at the start of the
                    // next row — TODO confirm single-column CRLF input.
                    if (columnsRead.get() == headers.size() - 1) {
                        process(columnsRead, buffer, json);
                        done = true;
                        break;
                    } else {
                        // skip token if not all headers read.
                        continue;
                    }
                case TOKEN_QUOTE:
                    // toggle quoted to support commas within quotes.
                    quoted = !quoted;
                    break;
                case TOKEN_SEPARATOR:
                    if (!quoted) {
                        process(columnsRead, buffer, json);
                        break;
                    }
                default:
                    // store the current token in the buffer until the column ends.
                    buffer.put(current);
            }
        }

        if (!(columnsRead.get() == headers.size())) {
            throw new ParserException(
                    String.format("Error at line %d, values (%d) does not match headers (%d).",
                            index, columnsRead.get(), headers.size()));
        } else {
            index++;
        }

        // parse json object.
        return json;
    }

    /**
     * Converts a raw column value to the most specific of Integer, Boolean or
     * String.
     *
     * @param data the column bytes, NUL-terminated; decoded with the platform
     *             default charset and trimmed (which also drops the NUL).
     * @return an Integer for digit-only values that fit in an int, a Boolean
     *         for "true"/"false", otherwise the trimmed String.
     */
    private Object parseDatatype(byte[] data) {
        String line = new String(data).trim();

        // "[0-9]+" rather than "[0-9]*": the empty string matched "[0-9]*",
        // making Integer.parseInt("") throw on any empty column.
        if (line.matches("[0-9]+")) {
            try {
                return Integer.parseInt(line);
            } catch (NumberFormatException e) {
                // digit run too long for an int: keep the textual value.
                return line;
            }
        } else if (line.matches("true|false")) {
            return Boolean.parseBoolean(line);
        } else {
            return line;
        }
    }

    @Override
    public int getNumberOfElements() {
        return rows;
    }

    /**
     * Streams all data rows to the subscriber. Rewinds the mapping, skips the
     * header row, then emits up to {@code rows} rows across request() calls.
     *
     * @param subscriber receives one JsonObject per row, then onComplete.
     */
    @Override
    public void subscribe(Subscriber<? super JsonObject> subscriber) {
        map.position(0);
        readRow();
        index = 0;

        subscriber.onSubscribe(new Subscription() {
            private boolean complete = false;
            // rows emitted so far; shared across request() calls so repeated
            // requests never read past the end of the data (the original
            // restarted from 0 each call and could double-read / over-read).
            private int emitted = 0;

            @Override
            public void request(long count) {
                for (int i = 0; i < count && emitted < rows && !complete; i++) {
                    JsonObject result = readRow();
                    emitted++;

                    if (result != null) {
                        subscriber.onNext(result);
                    } else {
                        complete = true;
                        subscriber.onComplete();
                    }
                }

                if (emitted >= rows && !complete) {
                    // set the flag so onComplete fires exactly once
                    // (Reactive Streams rule 1.7).
                    complete = true;
                    subscriber.onComplete();
                }
            }

            @Override
            public void cancel() {
                // send no more items!
                complete = true;
            }
        });
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package com.codingchili.Model;

/**
 * @author Robin Duda
 *
 * Thrown when a row contains more values than the header row declares.
 */
public class ColumnsExceededHeadersException extends ParserException {

    /**
     * @param values  number of values encountered on the offending row.
     * @param headers the number of headers on the first row.
     * @param index   the line in the file.
     */
    public ColumnsExceededHeadersException(int values, int headers, int index) {
        super("Encountered too many values (" + values + ") on row " + index
                + ", expected to match headers (" + headers + ").");
    }
}
Loading

0 comments on commit 70343a4

Please sign in to comment.