Skip to content

Commit

Permalink
IMAGES: configure image parser support, without it, no EXIF header parsing happens.
Browse files Browse the repository at this point in the history

PyLint: simple code cleanup
  • Loading branch information
mubaldino committed Mar 2, 2023
1 parent 115a860 commit 82d84a1
Show file tree
Hide file tree
Showing 8 changed files with 27 additions and 25 deletions.
1 change: 1 addition & 0 deletions etc/tika-config.xml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
<parser class="org.apache.tika.parser.microsoft.rtf.RTFParser"></parser>
<parser class="org.apache.tika.parser.odf.OpenDocumentParser"></parser>
<parser class="org.apache.tika.parser.pdf.PDFParser"></parser>
<parser class="org.apache.tika.parser.image.JpegParser"></parser>

</parsers>
</properties>
11 changes: 8 additions & 3 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@
</developer>
</developers>
<properties>
<slf4j.version>2.0.3</slf4j.version>
<tika.version>2.5.0</tika.version>
<log4j.version>2.19.0</log4j.version>
<slf4j.version>2.0.6</slf4j.version>
<tika.version>2.7.0</tika.version>
<log4j.version>2.20.0</log4j.version>
</properties>
<dependencyManagement>
<dependencies>
Expand Down Expand Up @@ -277,6 +277,11 @@
<artifactId>tika-parser-pdf-module</artifactId>
<version>${tika.version}</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parser-image-module</artifactId>
<version>${tika.version}</version>
</dependency>
</dependencies>
<build>
<pluginManagement>
Expand Down
5 changes: 2 additions & 3 deletions src/main/java/org/opensextant/xtext/ConvertedDocument.java
Original file line number Diff line number Diff line change
Expand Up @@ -482,9 +482,8 @@ public void setDefaultID() throws IOException, NoSuchAlgorithmException {
* The whole point of this mess: get the text from the original. It is set here, and line endings are normalized to Unix line endings (\n).
*
* @param buf textual data for this document object
* @throws UnsupportedEncodingException on err
*/
public void setText(String buf) throws UnsupportedEncodingException {
public void setText(String buf) {
this.buffer = buf;

if (StringUtils.isBlank(buffer)) {
Expand All @@ -503,7 +502,7 @@ public void setText(String buf) throws UnsupportedEncodingException {
textpath = this.filepath;
}

addNumberProperty("textsize", (long) buffer.length());
addNumberProperty("textsize", buffer.length());
}

/**
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/org/opensextant/xtext/PathManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -610,7 +610,7 @@ public static String getEmbeddedPath(String container, String item) {
* @param filepath path to test
* @return true if "/xtext/" or "\xtext\" (case sensitive) is found anywhere in path.
*/
public static final boolean isXTextCache(String filepath) {
public static boolean isXTextCache(String filepath) {
if (filepath.contains(DEFAULT_EMBED_FOLDER_IN_PATH)) {
return true;
}
Expand All @@ -627,7 +627,7 @@ public static final boolean isXTextCache(String filepath) {
* @param obj path to test.
* @return true if file parent is "xtext", case sensitive.
*/
public static final boolean isXTextCache(File obj) {
public static boolean isXTextCache(File obj) {
return DEFAULT_EMBED_FOLDER.equals(obj.getParentFile().getName());
}

Expand Down
3 changes: 1 addition & 2 deletions src/main/java/org/opensextant/xtext/XText.java
Original file line number Diff line number Diff line change
Expand Up @@ -687,9 +687,8 @@ public ConvertedDocument convertFile(File input, ConvertedDocument parent) throw
* is trapped in loop
*
* @param input the input
* @throws IOException on err
*/
public void convertFolder(File input) throws IOException {
public void convertFolder(File input) {
java.util.Collection<File> files = FileUtils.listFiles(input,
new SuffixFileFilter(fileFilters, IOCase.INSENSITIVE), FileFilterUtils.trueFileFilter());
for (File f : files) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
Expand All @@ -31,6 +32,9 @@
import org.opensextant.util.FileUtility;
import org.opensextant.util.TextUtils;
import org.opensextant.xtext.ConvertedDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;

/**
* Default conversion is almost a pass through from Tika's auto parser and BodyContentHandler.
Expand All @@ -41,6 +45,8 @@
*/
public class DefaultConverter extends ConverterAdapter {

private final Logger log = LoggerFactory.getLogger(getClass());

/* 1 MB of text from a given document */
public final static int MAX_TEXT_SIZE = 0x100000;
private final Detector detector = new DefaultDetector();
Expand Down Expand Up @@ -77,8 +83,10 @@ protected ConvertedDocument conversionImplementation(InputStream input, java.io.
parser.parse(input, handler, metadata, ctx);
} catch (NoClassDefFoundError classErr) {
throw new IOException("Unable to parse content due to Tika misconfiguration", classErr);
} catch (Exception xerr) {
throw new IOException("Unable to parse content", xerr);
} catch (TikaException e1) {
throw new IOException("Tika: Unable to parse content", e1);
} catch (SAXException e2) {
throw new IOException("SAX: Unable to parse content", e2);
}
ConvertedDocument textdoc = new ConvertedDocument(doc);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,7 @@
import java.text.ParseException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringUtils;
Expand Down Expand Up @@ -58,12 +56,7 @@ public class ImageMetadataConverter extends ConverterAdapter {
private final Logger logger = LoggerFactory.getLogger(getClass());
private boolean emitMinimalText = true;

public final static String[] usefulFields = { "geo", "gps", "creation", "date", "model" };

private final static Set<String> usefulFieldsSet = new HashSet<>();
static {
usefulFieldsSet.addAll(Arrays.asList(usefulFields));
}
public final static String[] usefulFields = {"geo", "gps", "creation", "date", "model"};

public ImageMetadataConverter() {
ctx.set(Parser.class, parser);
Expand Down Expand Up @@ -131,6 +124,7 @@ protected ConvertedDocument conversionImplementation(InputStream in, File doc)
StringBuilder buf = new StringBuilder();
BodyContentHandler handler = new BodyContentHandler();

//
String type = "Image";
String objName = null;
if (doc != null) {
Expand Down Expand Up @@ -188,7 +182,7 @@ protected ConvertedDocument conversionImplementation(InputStream in, File doc)

// Location if available.
if (lat != null && lon != null) {
logger.info("Found a location LAT={} LON={}", lat, lon);
logger.debug("Found a location LAT={} LON={}", lat, lon);
// imgDoc.addProperty("location", String.format("%2.8f,%3.8f", ));
imgDoc.addUserProperty("location", String.format("%s, %s", lat, lon));
try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ public class MessageConverter extends ConverterAdapter {
protected Logger logger = LoggerFactory.getLogger(getClass());
private final Session noSession = Session.getDefaultInstance(new Properties());
private int attachmentNumber = 0;
private final List<String> textEncodings = new LinkedList<>();
private Converter payloadConverter = new TikaHTMLConverter(false);

/**
Expand All @@ -91,7 +90,6 @@ public class MessageConverter extends ConverterAdapter {
protected ConvertedDocument conversionImplementation(InputStream in, File doc)
throws IOException {
attachmentNumber = 0;
textEncodings.clear();
try {
// Connect to the message file
MimeMessage msg = new MimeMessage(noSession, in);
Expand Down Expand Up @@ -268,8 +266,6 @@ public void parseMessage(Part bodyPart, ConvertedDocument parent, StringBuilder
try {

PartMetadata meta = new PartMetadata(bodyPart);
// String charset = (meta.charset == null ? "UTF-8" : meta.charset);
textEncodings.add(meta.charset);

String filename = bodyPart.getFileName();
String fileext = meta.getPossibleFileExtension();
Expand Down

0 comments on commit 82d84a1

Please sign in to comment.