Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Caimany #32

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ python-boilerpipe

A python wrapper for Boilerpipe_, an excellent Java library for boilerplate removal and fulltext extraction from HTML pages.



Configuration
=============

Expand Down Expand Up @@ -36,6 +38,8 @@ If no extractor is passed the DefaultExtractor will be used by default. Addition

from boilerpipe.extract import Extractor
extractor = Extractor(extractor='ArticleExtractor', url=your_url)

You can choose ``url``, ``html`` or ``file`` as the second argument.

Then, to extract relevant content:

Expand All @@ -46,3 +50,4 @@ Then, to extract relevant content:
extracted_html = extractor.getHTML()

.. _Boilerpipe: http://code.google.com/p/boilerpipe/

4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ def download_jars(datapath, version=boilerpipe_version):
],
author='Misja Hoebe',
author_email='[email protected]',
maintainer = 'Matthew Russell',
maintainer_email = '[email protected]',
maintainer = 'Matthew Russell, CaiMujia',
maintainer_email = '[email protected], [email protected]',
url = 'https://github.com/ptwobrussell/python-boilerpipe/',
classifiers=[
'Development Status :: 5 - Production/Stable',
Expand Down
64 changes: 40 additions & 24 deletions src/boilerpipe/extract/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import socket
import charade
import threading
import re

socket.setdefaulttimeout(15)
lock = threading.Lock()
Expand All @@ -18,7 +19,8 @@ class Extractor(object):
being one of the boilerpipe extractors:
- DefaultExtractor
- ArticleExtractor
- ArticleSentencesExtractor
- ArticleSentencesExtractor

- KeepEverythingExtractor
- KeepEverythingWithMinKWordsExtractor
- LargestContentExtractor
Expand All @@ -29,7 +31,7 @@ class Extractor(object):
source = None
data = None
headers = {'User-Agent': 'Mozilla/5.0'}

def __init__(self, extractor='DefaultExtractor', **kwargs):
if kwargs.get('url'):
request = urllib2.Request(kwargs['url'], headers=self.headers)
Expand All @@ -38,11 +40,34 @@ def __init__(self, extractor='DefaultExtractor', **kwargs):
encoding = connection.headers['content-type'].lower().split('charset=')[-1]
if encoding.lower() == 'text/html':
encoding = charade.detect(self.data)['encoding']
self.data = unicode(self.data, encoding)
# self.data = unicode(self.data, 'gbk')
#self.data = self.data.decode(encoding, 'ignore')
try:
self.data = unicode(self.data, charade.detect(self.data)['encoding'])
except UnicodeError:
encoding = charade.detect(self.data)['encoding']
self.data = self.data.decode(encoding, 'ignore')
elif kwargs.get('html'):
self.data = kwargs['html']
if not isinstance(self.data, unicode):
self.data = unicode(self.data, charade.detect(self.data)['encoding'])
# self.data = unicode(self.data, charade.detect(self.data)['encoding'])
try:
self.data = unicode(self.data, charade.detect(self.data)['encoding'])
except UnicodeError:
encoding = charade.detect(self.data)['encoding']
self.data = self.data.decode(encoding, 'ignore')
## Extractor(extractor='ArticleExtractor',file='/tmp/a.html')
elif kwargs.get('file'):
Path = kwargs['file']
f = open(Path, 'r')
self.data = f.read()
if not isinstance(self.data, unicode):
try:
self.data = unicode(self.data, charade.detect(self.data)['encoding'])
except UnicodeError:
encoding = charade.detect(self.data)['encoding']
self.data = self.data.decode(encoding, 'ignore')

else:
raise Exception('No text or url provided')

Expand All @@ -52,35 +77,26 @@ def __init__(self, extractor='DefaultExtractor', **kwargs):
if jpype.isThreadAttachedToJVM() == False:
jpype.attachThreadToJVM()
lock.acquire()

self.extractor = jpype.JClass(
"de.l3s.boilerpipe.extractors."+extractor).INSTANCE
finally:
lock.release()

reader = StringReader(self.data)
self.source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
self.extractor.process(self.source)

def getText(self):
    """Return the boilerplate-free plain text extracted from the document."""
    text = self.source.getContent()
    return text


def getTitle(self):
    """Return the title detected for the parsed document."""
    title = self.source.getTitle()
    return title

def getHTML(self):
    """Return the extracted content as HTML markup.

    Uses Boilerpipe's HTMLHighlighter in "extracting" mode to re-render
    only the content portions of the original document.
    """
    return HTMLHighlighter.newExtractingInstance().process(self.source, self.data)

def getImages(self):
    """Return images found in the document as a list of dicts.

    Each dict carries the image's ``src``, ``width``, ``height``, ``alt``
    and ``area``; the underlying Java list is sorted (via Collections.sort)
    before conversion.
    """
    image_extractor = jpype.JClass(
        "de.l3s.boilerpipe.sax.ImageExtractor").INSTANCE
    found = image_extractor.process(self.source, self.data)
    jpype.java.util.Collections.sort(found)
    results = []
    for img in found:
        results.append({
            'src'   : img.getSrc(),
            'width' : img.getWidth(),
            'height': img.getHeight(),
            'alt'   : img.getAlt(),
            'area'  : img.getArea(),
        })
    return results

def getDate(self):
    """Return the first ``YYYY-MM-DD HH:MM:SS`` timestamp found in the raw document.

    Scans ``self.data`` (the raw page text) for a timestamp whose year is in
    1970-2019, matching the original pattern's intent.

    Returns:
        The matched timestamp string, or ``None`` when no timestamp is
        present (the original raised ``AttributeError`` on a failed search).
    """
    # Raw string so backslash classes are not escape-processed; the hour
    # alternation is fixed to 00-23 (the original accepted the invalid "24").
    pattern = (r'(19[7-9][0-9]|20[0-1][0-9])-(0[1-9]|1[0-2])-'
               r'([1-2][0-9]|0[1-9]|3[0-1]) '
               r'([0-1][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])')
    match = re.search(pattern, self.data)
    return match.group() if match else None