From 7df45c306c3d1d81bb5b85034c77b907250b2bcf Mon Sep 17 00:00:00 2001 From: Fedele Mantuano Date: Fri, 9 Sep 2016 00:02:02 +0200 Subject: [PATCH 1/5] Fix url tests in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9bff11c..431baaf 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,7 @@ tika_client.extract_only_content(payload="base64_payload") ## Performance tests -These are the results of performance tests in [profiling](https://github.com/fedelemantuano/tika-app-python/tree/develop/profiling) folder: +These are the results of performance tests in [tests](https://github.com/fedelemantuano/tika-app-python/tree/develop/tests) folder: ``` tika_content_type() 0.708108 sec From 5f6edecaf7bbfcf32f2ca8d95a5bfcb11a370d95 Mon Sep 17 00:00:00 2001 From: Fedele Mantuano Date: Sat, 10 Sep 2016 01:11:26 +0200 Subject: [PATCH 2/5] Added README for pypi --- MANIFEST | 1 + README | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ setup.py | 32 +++++++++++++++-- 3 files changed, 137 insertions(+), 2 deletions(-) create mode 100644 README diff --git a/MANIFEST b/MANIFEST index 6a8ccb9..3c74113 100644 --- a/MANIFEST +++ b/MANIFEST @@ -1,4 +1,5 @@ # file GENERATED by distutils, do NOT edit +README setup.cfg setup.py tikapp/__init__.py diff --git a/README b/README new file mode 100644 index 0000000..b48b000 --- /dev/null +++ b/README @@ -0,0 +1,106 @@ +tika-app-python +=============== + +Overview +-------- + +tika-app-python is a wrapper for `Apache Tika App`_. + +Apache 2 Open Source License +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +tika-app-python can be downloaded, used, and modified free of charge. It +is available under the Apache 2 license. 
+ +Authors +------- + +Main Author +~~~~~~~~~~~ + +Fedele Mantuano (**Twitter**: +`@fedelemantuano <https://twitter.com/fedelemantuano>`_) + +Installation +------------ + +Clone repository + +:: + + git clone https://github.com/fedelemantuano/tika-app-python.git + +and install tika-app-python with ``setup.py``: + +:: + + cd tika-app-python + + python setup.py install + +or use ``pip``: + +:: + + pip install tika-app + +Usage +----- + +Import ``TikaApp`` class: + +:: + + from tikapp import TikaApp + + tika_client = TikaApp(file_jar="/opt/tika/tika-app-1.13.jar") + +To get the **content type**: + +:: + + tika_client.detect_content_type("your_file") + +To detect the **language**: + +:: + + tika_client.detect_language("your_file") + +To extract **all metadata and content**: + +:: + + tika_client.extract_all_content("your_file") + +To extract **only content**: + +:: + + tika_client.extract_only_content("your_file") + +If you want to use payload in base64, you can use the same methods with +``payload`` argument: + +:: + + tika_client.detect_content_type(payload="base64_payload") + tika_client.detect_language(payload="base64_payload") + tika_client.extract_all_content(payload="base64_payload") + tika_client.extract_only_content(payload="base64_payload") + +Performance tests +----------------- + +These are the results of performance tests in `tests`_ folder: + +:: + + tika_content_type() 0.708108 sec + tika_detect_language() 1.748900 sec + magic_content_type() 0.000215 sec + tika_extract_all_content() 0.849755 sec + tika_extract_only_content() 0.791735 sec + +.. _Apache Tika App: https://tika.apache.org/ +.. 
_tests: https://github.com/fedelemantuano/tika-app-python/tree/develop/tests diff --git a/setup.py b/setup.py index c9d6f75..61637e4 100644 --- a/setup.py +++ b/setup.py @@ -1,18 +1,46 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*- +from os.path import join, dirname from distutils.core import setup + +VERSION = (0, 4, 0) +__version__ = VERSION +__versionstr__ = '.'.join(map(str, VERSION)) + +f = open(join(dirname(__file__), 'README')) +long_description = f.read().strip() +f.close() + +requires = ['simplejson'] + + setup( name='tika-app', - version='0.4', + version=__versionstr__, description='Python client for Apache Tika App', author='Fedele Mantuano', author_email='mantuano.fedele@gmail.com', maintainer='Fedele Mantuano', maintainer_email='mantuano.fedele@gmail.com', url='https://github.com/fedelemantuano/tika-app-python', + long_description=long_description, keywords=['tika', 'apache', 'toolkit'], - requires=['simplejson'], + requires=requires, license="Apache License, Version 2.0", packages=['tikapp'], + classifiers=[ + "License :: OSI Approved :: Apache Software License", + "Intended Audience :: Developers", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 2", + "Programming Language :: Python :: 2.6", + "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.2", + "Programming Language :: Python :: 3.3", + "Programming Language :: Python :: 3.4", + ], ) From bb296bdb6adbcbdf8b4d47e76b7e7d2ac533e8a3 Mon Sep 17 00:00:00 2001 From: Fedele Mantuano Date: Wed, 9 Nov 2016 22:06:43 +0100 Subject: [PATCH 3/5] Added VERSION. 
Fix relative import --- tikapp/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tikapp/__init__.py b/tikapp/__init__.py index e7cfa16..c5f6928 100644 --- a/tikapp/__init__.py +++ b/tikapp/__init__.py @@ -22,7 +22,7 @@ import os import tempfile from subprocess import Popen, PIPE, STDOUT -from exceptions import \ +from .exceptions import \ InvalidTikaAppJar, \ InvalidSwitches, \ InvalidFilePath, \ @@ -36,6 +36,10 @@ log = logging.getLogger(__name__) +VERSION = (0, 4, 0) +__version__ = VERSION +__versionstr__ = '.'.join(map(str, VERSION)) + class TikaApp(object): From a0859bd4b3cabde7101bbc02b479e4946918b0f9 Mon Sep 17 00:00:00 2001 From: Fedele Mantuano Date: Thu, 10 Nov 2016 23:54:32 +0100 Subject: [PATCH 4/5] Added command line swithes --- MANIFEST | 6 -- MANIFEST.in | 1 + README | 39 +++++++++++ README.md | 40 +++++++++++- requirements.txt | 3 +- setup.py | 48 ++++++++------ tests/test_tika_app.py | 91 ++++++++------------------ tikapp/__init__.py | 27 +++++--- tikapp/__main__.py | 142 +++++++++++++++++++++++++++++++++++++++++ 9 files changed, 297 insertions(+), 100 deletions(-) delete mode 100644 MANIFEST create mode 100644 MANIFEST.in create mode 100755 tikapp/__main__.py diff --git a/MANIFEST b/MANIFEST deleted file mode 100644 index 3c74113..0000000 --- a/MANIFEST +++ /dev/null @@ -1,6 +0,0 @@ -# file GENERATED by distutils, do NOT edit -README -setup.cfg -setup.py -tikapp/__init__.py -tikapp/exceptions.py diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..f9bd145 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include requirements.txt diff --git a/README b/README index b48b000..47af015 100644 --- a/README +++ b/README @@ -89,6 +89,45 @@ If you want to use payload in base64, you can use the same methods with tika_client.extract_all_content(payload="base64_payload") tika_client.extract_only_content(payload="base64_payload") +Usage from command-line +----------------------- + +If you installed 
tika-app-python with ``pip`` or ``setup.py`` you can +use it with command-line. To use tika-app-python you should submit the +Apache Tika app JAR. You can: - leave the default value: +``/opt/tika/tika-app-1.13.jar`` - set the environment value +``TIKA_APP_JAR`` - use ``--jar`` switch + +The last one overrides all the others. + +These are all switches: + +:: + + usage: tikapp [-h] (-f FILE | -p PAYLOAD) [-j JAR] [-d] [-t] [-l] [-a] + [-v] + + Wrapper for Apache Tika App. + + optional arguments: + -h, --help show this help message and exit + -f FILE, --file FILE File to submit (default: None) + -p PAYLOAD, --payload PAYLOAD + Base64 payload to submit (default: None) + -j JAR, --jar JAR Apache Tika app JAR (default: None) + -d, --detect Detect document type (default: False) + -t, --text Output plain text content (default: False) + -l, --language Output only language (default: False) + -a, --all Output metadata and content from all embedded files + (default: False) + -v, --version show program's version number and exit + +Example: + +.. code:: shell + + $ tikapp -f example_file -a + Performance tests ----------------- diff --git a/README.md b/README.md index 431baaf..c0a7dc1 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ or use `pip`: pip install tika-app ``` -## Usage +## Usage in a project Import `TikaApp` class: @@ -79,6 +79,44 @@ tika_client.extract_all_content(payload="base64_payload") tika_client.extract_only_content(payload="base64_payload") ``` +## Usage from command-line + +If you installed tika-app-python with `pip` or `setup.py` you can use it with command-line. +To use tika-app-python you should submit the Apache Tika app JAR. You can: + - leave the default value: `/opt/tika/tika-app-1.13.jar` + - set the environment value `TIKA_APP_JAR` + - use `--jar` switch + +The last one overrides all the others. + +These are all switches: + +``` +usage: tikapp [-h] (-f FILE | -p PAYLOAD) [-j JAR] [-d] [-t] [-l] [-a] + [-v] + +Wrapper for Apache Tika App. 
+ +optional arguments: + -h, --help show this help message and exit + -f FILE, --file FILE File to submit (default: None) + -p PAYLOAD, --payload PAYLOAD + Base64 payload to submit (default: None) + -j JAR, --jar JAR Apache Tika app JAR (default: None) + -d, --detect Detect document type (default: False) + -t, --text Output plain text content (default: False) + -l, --language Output only language (default: False) + -a, --all Output metadata and content from all embedded files + (default: False) + -v, --version show program's version number and exit +``` + +Example: + +```shell +$ tikapp -f example_file -a +``` + ## Performance tests These are the results of performance tests in [tests](https://github.com/fedelemantuano/tika-app-python/tree/develop/tests) folder: diff --git a/requirements.txt b/requirements.txt index c457133..81ce69e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ +chainmap==1.0.2 python-magic==0.4.12 -simplejson==3.8.2 +simplejson==3.10.0 diff --git a/setup.py b/setup.py index 61637e4..dfed53d 100644 --- a/setup.py +++ b/setup.py @@ -1,35 +1,46 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from os.path import join, dirname -from distutils.core import setup +""" +Copyright 2016 Fedele Mantuano (https://twitter.com/fedelemantuano) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" -VERSION = (0, 4, 0) -__version__ = VERSION -__versionstr__ = '.'.join(map(str, VERSION)) +from os.path import join, dirname +from distutils.core import setup +from tikapp import __versionstr__ -f = open(join(dirname(__file__), 'README')) -long_description = f.read().strip() -f.close() -requires = ['simplejson'] +long_description = open(join(dirname(__file__), 'README')).read().strip() +requires = open(join(dirname(__file__), + 'requirements.txt')).read().splitlines() setup( name='tika-app', - version=__versionstr__, description='Python client for Apache Tika App', + license="Apache License, Version 2.0", + url='https://github.com/fedelemantuano/tika-app-python', + long_description=long_description, + version=__versionstr__, author='Fedele Mantuano', author_email='mantuano.fedele@gmail.com', maintainer='Fedele Mantuano', maintainer_email='mantuano.fedele@gmail.com', - url='https://github.com/fedelemantuano/tika-app-python', - long_description=long_description, - keywords=['tika', 'apache', 'toolkit'], - requires=requires, - license="Apache License, Version 2.0", packages=['tikapp'], + platforms=["Linux", ], + keywords=['tika', 'apache', 'toolkit'], classifiers=[ "License :: OSI Approved :: Apache Software License", "Intended Audience :: Developers", @@ -38,9 +49,8 @@ "Programming Language :: Python :: 2", "Programming Language :: Python :: 2.6", "Programming Language :: Python :: 2.7", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.2", - "Programming Language :: Python :: 3.3", - "Programming Language :: Python :: 3.4", ], + install_requires=requires, + entry_points={'console_scripts': [ + 'tikapp = tikapp.__main__:main']}, ) diff --git a/tests/test_tika_app.py b/tests/test_tika_app.py index 89e1eeb..d871928 100644 --- a/tests/test_tika_app.py +++ b/tests/test_tika_app.py @@ -40,38 +40,30 @@ class TestTikaApp(unittest.TestCase): def test_invalid_tika_app_jar(self): self.assertRaises( tika.InvalidTikaAppJar, - tika.TikaApp, - ) 
+ tika.TikaApp) def test_invalid_switches(self): - tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.12.jar") + tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.13.jar") with self.assertRaises(tika.InvalidSwitches): tika_app.generic("--help") def test_generic(self): - tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.12.jar") - self.assertIsInstance( - tika_app.generic(), - str, - ) + tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.13.jar") + self.assertIsInstance(tika_app.generic(), str) def test_invalid_parameters(self): - tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.12.jar") + tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.13.jar") with self.assertRaises(tika.InvalidParameters): - tika_app.extract_all_content( - file_path=None, - payload=None, - ) + tika_app.extract_all_content(file_path=None, payload=None) with self.assertRaises(tika.InvalidParameters): - tika_app.extract_all_content( - file_path=True, - payload=True, - ) + tika_app.extract_all_content(file_path=True, payload=True) def test_extract_content_from_file(self): - tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.12.jar") + tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.13.jar") + + self.assertEqual("/opt/tika/tika-app-1.13.jar", tika_app.file_jar) result = tika_app.extract_all_content(test_zip) self.assertIsInstance(result, str) @@ -82,43 +74,26 @@ def test_extract_content_from_file(self): self.assertEqual(result_obj[0]["Content-Type"], "application/zip") self.assertEqual( result_obj[1]["Content-Type"], - "text/plain; charset=ISO-8859-1" - ) - self.assertEqual( - result_obj[0]["resourceName"], - "test.zip" - ) - self.assertEqual( - result_obj[1]["resourceName"], - "test.txt" - ) + "text/plain; charset=ISO-8859-1") + self.assertEqual(result_obj[0]["resourceName"], "test.zip") + self.assertEqual(result_obj[1]["resourceName"], "test.txt") def test_extract_content_obj(self): - tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.12.jar") + tika_app 
= tika.TikaApp(file_jar="/opt/tika/tika-app-1.13.jar") result_obj = tika_app.extract_all_content( - file_path=test_zip, - convert_to_obj=True, - ) + file_path=test_zip, convert_to_obj=True) self.assertIsInstance(result_obj, list) self.assertEqual(len(result_obj), 2) self.assertEqual(result_obj[0]["Content-Type"], "application/zip") - self.assertEqual( - result_obj[1]["Content-Type"], - "text/plain; charset=ISO-8859-1" - ) - self.assertEqual( - result_obj[0]["resourceName"], - "test.zip" - ) - self.assertEqual( - result_obj[1]["resourceName"], - "test.txt" - ) + self.assertEqual(result_obj[1]["Content-Type"], + "text/plain; charset=ISO-8859-1") + self.assertEqual(result_obj[0]["resourceName"], "test.zip") + self.assertEqual(result_obj[1]["resourceName"], "test.txt") def test_extract_content_from_buffer(self): - tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.12.jar") + tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.13.jar") with open(test_zip, 'rb') as f: payload = f.read().encode("base64") @@ -132,33 +107,23 @@ def test_extract_content_from_buffer(self): result_file_obj = json.loads(result_file) result_payload_obj = json.loads(result_payload) - self.assertEqual( - result_file_obj[0]["Content-Type"], - result_payload_obj[0]["Content-Type"], - ) + self.assertEqual(result_file_obj[0]["Content-Type"], + result_payload_obj[0]["Content-Type"]) - self.assertEqual( - result_file_obj[1]["Content-Type"], - result_payload_obj[1]["Content-Type"], - ) + self.assertEqual(result_file_obj[1]["Content-Type"], + result_payload_obj[1]["Content-Type"]) - self.assertEqual( - result_file_obj[1]["resourceName"], - result_payload_obj[1]["resourceName"], - ) + self.assertEqual(result_file_obj[1]["resourceName"], + result_payload_obj[1]["resourceName"]) def test_language(self): - tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.12.jar") - + tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.13.jar") result = tika_app.detect_language(file_path=test_txt) - 
self.assertEqual(result, "en") def test_extract_only_content(self): - tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.12.jar") - + tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.13.jar") result = tika_app.extract_only_content(file_path=test_txt) - self.assertIsInstance(result, str) self.assertIn("test", result) diff --git a/tikapp/__init__.py b/tikapp/__init__.py index c5f6928..c2e9acf 100644 --- a/tikapp/__init__.py +++ b/tikapp/__init__.py @@ -46,14 +46,11 @@ class TikaApp(object): def __init__( self, file_jar=None, - memory_allocation=None, + memory_allocation=None ): - if not file_jar or not os.path.exists(file_jar): - log.exception("Invalid Tika app jar") - raise InvalidTikaAppJar("Invalid Tika app jar") - self._file_jar = file_jar - self._memory_allocation = memory_allocation + self.file_jar = file_jar + self.memory_allocation = memory_allocation def _write_payload(self, payload): """Write a base64 payload on temp file @@ -85,11 +82,9 @@ def _file_path(self, file_path=None, payload=None): file_ = file_path else: log.exception( - "Invalid parameters: you must pass file_path or payload" - ) + "Invalid parameters: you must pass file_path or payload") raise InvalidParameters( - "Invalid parameters: you must pass file_path or payload" - ) + "Invalid parameters: you must pass file_path or payload") if not os.path.exists(file_): log.exception("File {} does not exist".format(file_)) @@ -135,10 +130,22 @@ def _command_template(self, switches): def file_jar(self): return self._file_jar + @file_jar.setter + def file_jar(self, value): + if not value or not os.path.exists(value): + log.exception("Invalid Tika app jar") + raise InvalidTikaAppJar("Invalid Tika app jar") + + self._file_jar = value + @property def memory_allocation(self): return self._memory_allocation + @memory_allocation.setter + def memory_allocation(self, value): + self._memory_allocation = value + @property def help(self): return self._command_template(["--help"]) diff --git 
a/tikapp/__main__.py b/tikapp/__main__.py new file mode 100755 index 0000000..bf309bd --- /dev/null +++ b/tikapp/__main__.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Copyright 2016 Fedele Mantuano (https://twitter.com/fedelemantuano) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import argparse +import os +import sys + +try: + from collections import ChainMap +except ImportError: + from chainmap import ChainMap + +current = os.path.realpath(os.path.dirname(__file__)) +root = os.path.join(current, '..') +sys.path.append(root) + +from tikapp import TikaApp, __versionstr__ + + +def get_args(): + parser = argparse.ArgumentParser( + description="Wrapper for Apache Tika App.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parsing_group = parser.add_mutually_exclusive_group(required=True) + parsing_group.add_argument( + "-f", + "--file", + dest="file", + help="File to submit") + parsing_group.add_argument( + "-p", + "--payload", + dest="payload", + help="Base64 payload to submit") + + parser.add_argument( + "-j", + "--jar", + dest="jar", + help="Apache Tika app JAR") + + parser.add_argument( + "-d", + "--detect", + dest="detect", + action="store_true", + help="Detect document type") + + parser.add_argument( + "-t", + "--text", + dest="text", + action="store_true", + help="Output plain text content") + + parser.add_argument( + "-l", + "--language", + dest="language", + action="store_true", + help="Output only language") + + 
parser.add_argument( + "-a", + "--all", + dest="all", + action="store_true", + help="Output metadata and content from all embedded files") + + parser.add_argument( + '-v', + '--version', + action='version', + version='%(prog)s {}'.format(__versionstr__)) + + return parser.parse_args() + + +def main(): + args = get_args() + + command_line = dict() + if args.jar: + command_line = {"TIKA_APP_JAR": args.jar} + + defaults = {"TIKA_APP_JAR": "/opt/tika/tika-app-1.13.jar"} + options = ChainMap(command_line, os.environ, defaults) + + tika = TikaApp(options['TIKA_APP_JAR']) + + if args.file: + f = args.file + + if args.detect: + print(tika.detect_content_type(file_path=f).encode('utf-8')) + + if args.text: + print(tika.extract_only_content(file_path=f).encode('utf-8')) + + if args.language: + print(tika.detect_language(file_path=f).encode('utf-8')) + + if args.all: + print(tika.extract_all_content( + file_path=f, pretty_print=True).encode('utf-8')) + + elif args.payload: + p = args.payload + + if args.detect: + print(tika.detect_content_type(payload=p).encode('utf-8')) + + if args.text: + print(tika.extract_only_content(payload=p).encode('utf-8')) + + if args.language: + print(tika.detect_language(payload=p).encode('utf-8')) + + if args.all: + print(tika.extract_all_content( + payload=p, pretty_print=True).encode('utf-8')) + + +if __name__ == '__main__': + main() From 362711584d89c4b4ef9907e42a6b70803ba8329e Mon Sep 17 00:00:00 2001 From: Fedele Mantuano Date: Thu, 10 Nov 2016 23:58:24 +0100 Subject: [PATCH 5/5] New version --- tikapp/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tikapp/__init__.py b/tikapp/__init__.py index c2e9acf..fa203c2 100644 --- a/tikapp/__init__.py +++ b/tikapp/__init__.py @@ -36,7 +36,7 @@ log = logging.getLogger(__name__) -VERSION = (0, 4, 0) +VERSION = (0, 5, 0) __version__ = VERSION __versionstr__ = '.'.join(map(str, VERSION))