From 149bd1db7983929471719e95779c76b20619c2ea Mon Sep 17 00:00:00 2001
From: caimujia
Date: Mon, 1 Jun 2015 11:03:10 +0800
Subject: [PATCH 1/3] Update __init__.py

Solve a Unicode decoding error with charsets such as gb2312. Add
getTitle and getDate methods and remove getImages. Add support for
loading the source document from a local file.
---
 src/boilerpipe/extract/__init__.py | 64 +++++++++++++++++++-----------
 1 file changed, 40 insertions(+), 24 deletions(-)

diff --git a/src/boilerpipe/extract/__init__.py b/src/boilerpipe/extract/__init__.py
index c427209..e61c317 100644
--- a/src/boilerpipe/extract/__init__.py
+++ b/src/boilerpipe/extract/__init__.py
@@ -3,6 +3,7 @@
 import socket
 import charade
 import threading
+import re
 
 socket.setdefaulttimeout(15)
 lock = threading.Lock()
@@ -18,7 +19,8 @@ class Extractor(object):
     being one of the boilerpipe extractors:
     - DefaultExtractor
     - ArticleExtractor
     - ArticleSentencesExtractor
+
     - KeepEverythingExtractor
     - KeepEverythingWithMinKWordsExtractor
     - LargestContentExtractor
@@ -29,7 +31,7 @@ class Extractor(object):
     source = None
     data = None
     headers = {'User-Agent': 'Mozilla/5.0'}
-    
+
     def __init__(self, extractor='DefaultExtractor', **kwargs):
         if kwargs.get('url'):
             request = urllib2.Request(kwargs['url'], headers=self.headers)
@@ -38,11 +40,34 @@ def __init__(self, extractor='DefaultExtractor', **kwargs):
             encoding = connection.headers['content-type'].lower().split('charset=')[-1]
             if encoding.lower() == 'text/html':
                 encoding = charade.detect(self.data)['encoding']
-            self.data = unicode(self.data, encoding)
+            try:
+                self.data = unicode(self.data, charade.detect(self.data)['encoding'])
+            except UnicodeError:
+                encoding = charade.detect(self.data)['encoding']
+                self.data = self.data.decode(encoding, 'ignore')
         elif kwargs.get('html'):
             self.data = kwargs['html']
             if not isinstance(self.data, unicode):
-                self.data = unicode(self.data, charade.detect(self.data)['encoding'])
+                try:
+                    self.data = unicode(self.data, charade.detect(self.data)['encoding'])
+                except UnicodeError:
+                    encoding = charade.detect(self.data)['encoding']
+                    self.data = self.data.decode(encoding, 'ignore')
+        # e.g. Extractor(extractor='ArticleExtractor', file='/tmp/a.html')
+        elif kwargs.get('file'):
+            with open(kwargs['file'], 'r') as f:
+                self.data = f.read()
+            if not isinstance(self.data, unicode):
+                try:
+                    self.data = unicode(self.data, charade.detect(self.data)['encoding'])
+                except UnicodeError:
+                    encoding = charade.detect(self.data)['encoding']
+                    self.data = self.data.decode(encoding, 'ignore')
+
         else:
             raise Exception('No text or url provided')
@@ -52,35 +77,26 @@ def __init__(self, extractor='DefaultExtractor', **kwargs):
             if jpype.isThreadAttachedToJVM() == False:
                 jpype.attachThreadToJVM()
             lock.acquire()
-            
+
             self.extractor = jpype.JClass(
                 "de.l3s.boilerpipe.extractors."+extractor).INSTANCE
         finally:
             lock.release()
-        
+
         reader = StringReader(self.data)
         self.source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
         self.extractor.process(self.source)
-    
+
     def getText(self):
         return self.source.getContent()
-    
+
+    def getTitle(self):
+        return self.source.getTitle()
+
     def getHTML(self):
         highlighter = HTMLHighlighter.newExtractingInstance()
         return highlighter.process(self.source, self.data)
-
-    def getImages(self):
-        extractor = jpype.JClass(
-            "de.l3s.boilerpipe.sax.ImageExtractor").INSTANCE
-        images = extractor.process(self.source, self.data)
-        jpype.java.util.Collections.sort(images)
-        images = [
-            {
-                'src'   : image.getSrc(),
-                'width' : image.getWidth(),
-                'height': image.getHeight(),
-                'alt'   : image.getAlt(),
-                'area'  : image.getArea()
-            } for image in images
-        ]
-        return images
+
+    def getDate(self):
+        # Match the first timestamp of the form 'YYYY-MM-DD HH:MM:SS' in the raw document.
+        pattern = r'(19[7-9][0-9]|20[0-1][0-9])-(0[1-9]|1[0-2])-([1-2][0-9]|0[1-9]|3[0-1]) ([0-1][0-9]|2[0-4]):([0-5][0-9]):([0-5][0-9])'
+        match = re.search(pattern, self.data)
+        return match.group() if match else None
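
For reference, the decoding fallback introduced in PATCH 1/3 boils down to the
following standalone sketch. It assumes Python 2 and the charade package that
the module already imports; the helper name decode_html is hypothetical and is
not part of python-boilerpipe's API.

    # Minimal sketch of the decode-with-fallback logic from PATCH 1/3.
    # Assumes Python 2 and the `charade` package; `decode_html` is a
    # hypothetical helper name, not something the package exposes.
    import charade

    def decode_html(raw):
        """Return `raw` as unicode, tolerating charsets such as gb2312."""
        if isinstance(raw, unicode):
            return raw
        # charade.detect() returns a dict with an 'encoding' key; fall back
        # to utf-8 if detection gives nothing usable.
        encoding = charade.detect(raw)['encoding'] or 'utf-8'
        try:
            # Strict decode first, so well-formed input is preserved exactly.
            return unicode(raw, encoding)
        except UnicodeError:
            # Lossy decode instead of raising, mirroring the
            # decode(encoding, 'ignore') branch in the patch.
            return raw.decode(encoding, 'ignore')
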
"de.l3s.boilerpipe.sax.ImageExtractor").INSTANCE - images = extractor.process(self.source, self.data) - jpype.java.util.Collections.sort(images) - images = [ - { - 'src' : image.getSrc(), - 'width' : image.getWidth(), - 'height': image.getHeight(), - 'alt' : image.getAlt(), - 'area' : image.getArea() - } for image in images - ] - return images + + def getDate(self): + r='(19[7-9][0-9]|20[0-1][0-9])-(0[1-9]|1[0-2])-([1-2][0-9]|0[1-9]|3[0-1]) ([0-1][0-9]|2[0-4]):([0-5][0-9]):([0-5][0-9])' + return re.search(r,self.data).group() From 4969588893c800021272815cab053e64c88c01d8 Mon Sep 17 00:00:00 2001 From: caimujia Date: Mon, 1 Jun 2015 11:06:11 +0800 Subject: [PATCH 2/3] Update setup.py --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 149408b..0a4569a 100644 --- a/setup.py +++ b/setup.py @@ -43,8 +43,8 @@ def download_jars(datapath, version=boilerpipe_version): ], author='Misja Hoebe', author_email='misja.hoebe@gmail.com', - maintainer = 'Matthew Russell', - maintainer_email = 'ptwobrussell@gmail.com', + maintainer = 'Matthew Russell','CaiMujia' + maintainer_email = 'ptwobrussell@gmail.com','caimujia@gmail.com' url = 'https://github.com/ptwobrussell/python-boilerpipe/', classifiers=[ 'Development Status :: 5 - Production/Stable', From 83c63e5065c3a247f65221efd25071533d059f2f Mon Sep 17 00:00:00 2001 From: caimujia Date: Mon, 1 Jun 2015 11:13:31 +0800 Subject: [PATCH 3/3] Update README.rst --- README.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.rst b/README.rst index 09fe779..2b52cc4 100644 --- a/README.rst +++ b/README.rst @@ -6,6 +6,8 @@ python-boilerpipe A python wrapper for Boilerpipe_, an excellent Java library for boilerplate removal and fulltext extraction from HTML pages. + + Configuration ============= @@ -36,6 +38,8 @@ If no extractor is passed the DefaultExtractor will be used by default. Addition from boilerpipe.extract import Extractor extractor = Extractor(extractor='ArticleExtractor', url=your_url) + + ``you can chose 'url' ,'html' or 'file' as second argment. Then, to extract relevant content: @@ -46,3 +50,4 @@ Then, to extract relevant content: extracted_html = extractor.getHTML() .. _Boilerpipe: http://code.google.com/p/boilerpipe/ +