Skip to content

Commit

Permalink
Merge branch 'release/0.3'
Browse files Browse the repository at this point in the history
  • Loading branch information
fedelemantuano committed Aug 20, 2016
2 parents 3beaca0 + d3db6aa commit eb55df9
Show file tree
Hide file tree
Showing 7 changed files with 371 additions and 2 deletions.
21 changes: 20 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,16 +53,35 @@ For detect **language**:
tika_client.detect_language("your_file")
```

For detect **language**:
To extract **all metadata and content**:

```
tika_client.extract_all_content("your_file")
```

To extract **only the content**:

```
tika_client.extract_only_content("your_file")
```

If you want to pass a base64-encoded payload instead of a file path, use the same methods with the `payload` argument:

```
tika_client.detect_content_type(payload="base64_payload")
tika_client.detect_language(payload="base64_payload")
tika_client.extract_all_content(payload="base64_payload")
tika_client.extract_only_content(payload="base64_payload")
```

## Performance tests

These are the results of the performance tests in the [profiling](https://github.com/fedelemantuano/tika-app-python/tree/develop/profiling) folder:

```
tika_content_type() 0.708108 sec
tika_detect_language() 1.748900 sec
magic_content_type() 0.000215 sec
tika_extract_all_content() 0.849755 sec
tika_extract_only_content() 0.791735 sec
```
207 changes: 207 additions & 0 deletions profiling/lorem_ipsum.txt

Large diffs are not rendered by default.

Binary file added profiling/lorem_ipsum.txt.zip
Binary file not shown.
113 changes: 113 additions & 0 deletions profiling/performance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Copyright 2016 Fedele Mantuano (https://twitter.com/fedelemantuano)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from __future__ import unicode_literals
from tika_app.tika_app import TikaApp
import magic
import os
import timeit

profiling_path = os.path.realpath(os.path.dirname(__file__))
test_zip = os.path.join(profiling_path, "lorem_ipsum.txt.zip")
test_txt = os.path.join(profiling_path, "lorem_ipsum.txt")


def tika_content_type():
    """Run Tika content-type detection on the sample zip archive.

    A new TikaApp client is built on every call, so a timing of this
    function includes the client construction.
    """
    client = TikaApp(file_jar="/opt/tika/tika-app-1.13.jar")
    return client.detect_content_type(file_path=test_zip)


def tika_detect_language():
    """Run Tika language detection on the sample zip archive.

    A new TikaApp client is built on every call, so a timing of this
    function includes the client construction.
    """
    client = TikaApp(file_jar="/opt/tika/tika-app-1.13.jar")
    return client.detect_language(file_path=test_zip)


def magic_content_type():
    """Return what the magic module reports for the sample zip (mime=True)."""
    return magic.Magic(mime=True).from_file(test_zip)


def tika_extract_all_content(memory=None):
    """Run Tika's extract_all_content on the sample zip archive.

    Keyword arguments:
    memory -- forwarded to TikaApp as memory_allocation (default None)
    """
    client = TikaApp(
        memory_allocation=memory,
        file_jar="/opt/tika/tika-app-1.13.jar",
    )
    return client.extract_all_content(file_path=test_zip)


def tika_extract_only_content(memory=None):
    """Run Tika's extract_only_content on the sample zip archive.

    Keyword arguments:
    memory -- forwarded to TikaApp as memory_allocation (default None)
    """
    client = TikaApp(
        memory_allocation=memory,
        file_jar="/opt/tika/tika-app-1.13.jar",
    )
    return client.extract_only_content(file_path=test_zip)


if __name__ == "__main__":
    # Results recorded in the original script:
    #   tika_content_type()          0.708108 sec
    #   tika_detect_language()       1.748900 sec
    #   magic_content_type()         0.000215 sec
    #   tika_extract_all_content()   0.849755 sec
    #   tika_extract_only_content()  0.791735 sec

    repeats = 15

    # Names of the module-level benchmark functions, in report order.
    benchmark_names = (
        "tika_content_type",
        "tika_detect_language",
        "magic_content_type",
        "tika_extract_all_content",
        "tika_extract_only_content",
    )

    for name in benchmark_names:
        # timeit runs the statement in its own namespace, so the function
        # has to be imported from __main__ in the setup step.
        timer = timeit.Timer(
            stmt="{0}()".format(name),
            setup="from __main__ import {0}".format(name),
        )

        # Report the mean wall time of a single call.
        mean_sec = timer.timeit(repeats) / repeats

        print(
            "{function}()\t\t{sec:.6f} sec".format(
                function=name,
                sec=mean_sec,
            )
        )
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

setup(
name='Tika App',
version='0.2',
version='0.3',
description='Python client for Apache Tika App',
author='Fedele Mantuano',
author_email='[email protected]',
Expand Down
22 changes: 22 additions & 0 deletions tika_app/tika_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,28 @@ def detect_content_type(self, file_path=None, payload=None):

return result

def extract_only_content(self, file_path=None, payload=None):
    """Return only the text content of the passed file or payload.

    Keyword arguments:
    file_path -- Path of file
    payload -- Payload base64 of file

    Returns the output of the Tika "-t" command with leading and
    trailing whitespace stripped. Any exception raised by the command
    is propagated to the caller.
    """

    # NOTE(review): _file_path presumably writes the payload to a
    # temporary file and returns its path -- confirm against the helper.
    file_ = self._file_path(file_path, payload)

    switches = [
        "-t",
        file_,
    ]

    try:
        return self._command_template(switches).strip()
    finally:
        # Remove the payload file even when the command fails, so a
        # failed extraction does not leave it behind.
        if payload:
            os.remove(file_)

def detect_language(self, file_path=None, payload=None):
"""Return the language of passed file or payload.
Expand Down
8 changes: 8 additions & 0 deletions unittest/test_tika_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,14 @@ def test_language(self):

self.assertEqual(result, "en")

def test_extract_only_content(self):
    """extract_only_content returns a str that contains "test"."""
    client = tika.TikaApp(file_jar="/opt/tika/tika-app-1.12.jar")
    content = client.extract_only_content(file_path=test_txt)

    self.assertIsInstance(content, str)
    self.assertIn("test", content)


if __name__ == '__main__':
unittest.main()

0 comments on commit eb55df9

Please sign in to comment.