From 149bd1db7983929471719e95779c76b20619c2ea Mon Sep 17 00:00:00 2001
From: caimujia
Date: Mon, 1 Jun 2015 11:03:10 +0800
Subject: [PATCH 1/3] Update __init__.py

Solve a Unicode decoding error with charsets such as gb2312. Add
getTitle and getDate methods and remove getImages. Add support for
loading the source document from a local file.
---
 src/boilerpipe/extract/__init__.py | 64 +++++++++++++++++++-----------
 1 file changed, 40 insertions(+), 24 deletions(-)

diff --git a/src/boilerpipe/extract/__init__.py b/src/boilerpipe/extract/__init__.py
index c427209..e61c317 100644
--- a/src/boilerpipe/extract/__init__.py
+++ b/src/boilerpipe/extract/__init__.py
@@ -3,6 +3,7 @@
 import socket
 import charade
 import threading
+import re
 
 socket.setdefaulttimeout(15)
 lock = threading.Lock()
@@ -18,7 +19,8 @@ class Extractor(object):
     being one of the boilerpipe extractors:
     - DefaultExtractor
     - ArticleExtractor
     - ArticleSentencesExtractor
+
     - KeepEverythingExtractor
     - KeepEverythingWithMinKWordsExtractor
     - LargestContentExtractor
@@ -29,7 +31,7 @@ class Extractor(object):
     source = None
     data = None
     headers = {'User-Agent': 'Mozilla/5.0'}
-    
+
     def __init__(self, extractor='DefaultExtractor', **kwargs):
         if kwargs.get('url'):
             request = urllib2.Request(kwargs['url'], headers=self.headers)
@@ -38,11 +40,34 @@ def __init__(self, extractor='DefaultExtractor', **kwargs):
             encoding = connection.headers['content-type'].lower().split('charset=')[-1]
             if encoding.lower() == 'text/html':
                 encoding = charade.detect(self.data)['encoding']
-            self.data = unicode(self.data, encoding)
+            try:
+                self.data = unicode(self.data, charade.detect(self.data)['encoding'])
+            except UnicodeError:
+                encoding = charade.detect(self.data)['encoding']
+                self.data = self.data.decode(encoding, 'ignore')
         elif kwargs.get('html'):
             self.data = kwargs['html']
             if not isinstance(self.data, unicode):
-                self.data = unicode(self.data, charade.detect(self.data)['encoding'])
+                try:
+                    self.data = unicode(self.data, charade.detect(self.data)['encoding'])
+                except UnicodeError:
+                    encoding = charade.detect(self.data)['encoding']
+                    self.data = self.data.decode(encoding, 'ignore')
+        # e.g. Extractor(extractor='ArticleExtractor', file='/tmp/a.html')
+        elif kwargs.get('file'):
+            with open(kwargs['file'], 'r') as f:
+                self.data = f.read()
+            if not isinstance(self.data, unicode):
+                try:
+                    self.data = unicode(self.data, charade.detect(self.data)['encoding'])
+                except UnicodeError:
+                    encoding = charade.detect(self.data)['encoding']
+                    self.data = self.data.decode(encoding, 'ignore')
+
         else:
             raise Exception('No text or url provided')
@@ -52,35 +77,26 @@ def __init__(self, extractor='DefaultExtractor', **kwargs):
             if jpype.isThreadAttachedToJVM() == False:
                 jpype.attachThreadToJVM()
             lock.acquire()
-            
+
             self.extractor = jpype.JClass(
                 "de.l3s.boilerpipe.extractors."+extractor).INSTANCE
         finally:
             lock.release()
-        
+
         reader = StringReader(self.data)
         self.source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
         self.extractor.process(self.source)
-    
+
     def getText(self):
         return self.source.getContent()
-    
+
+    def getTitle(self):
+        return self.source.getTitle()
+
     def getHTML(self):
         highlighter = HTMLHighlighter.newExtractingInstance()
         return highlighter.process(self.source, self.data)
-
-    def getImages(self):
-        extractor = jpype.JClass(
-            "de.l3s.boilerpipe.sax.ImageExtractor").INSTANCE
-        images = extractor.process(self.source, self.data)
-        jpype.java.util.Collections.sort(images)
-        images = [
-            {
-                'src'   : image.getSrc(),
-                'width' : image.getWidth(),
-                'height': image.getHeight(),
-                'alt'   : image.getAlt(),
-                'area'  : image.getArea()
-            } for image in images
-        ]
-        return images
+
+    def getDate(self):
+        # Match the first timestamp of the form 'YYYY-MM-DD HH:MM:SS' in the raw document.
+        pattern = r'(19[7-9][0-9]|20[0-1][0-9])-(0[1-9]|1[0-2])-([1-2][0-9]|0[1-9]|3[0-1]) ([0-1][0-9]|2[0-4]):([0-5][0-9]):([0-5][0-9])'
+        match = re.search(pattern, self.data)
+        return match.group() if match else None
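
For reference, the decoding fallback introduced in PATCH 1/3 boils down to the
following standalone sketch. It assumes Python 2 and the charade package that
the module already imports; the helper name decode_html is hypothetical and is
not part of python-boilerpipe's API.

    # Minimal sketch of the decode-with-fallback logic from PATCH 1/3.
    # Assumes Python 2 and the `charade` package; `decode_html` is a
    # hypothetical helper name, not something the package exposes.
    import charade

    def decode_html(raw):
        """Return `raw` as unicode, tolerating charsets such as gb2312."""
        if isinstance(raw, unicode):
            return raw
        # charade.detect() returns a dict with an 'encoding' key; fall back
        # to utf-8 if detection gives nothing usable.
        encoding = charade.detect(raw)['encoding'] or 'utf-8'
        try:
            # Strict decode first, so well-formed input is preserved exactly.
            return unicode(raw, encoding)
        except UnicodeError:
            # Lossy decode instead of raising, mirroring the
            # decode(encoding, 'ignore') branch in the patch.
            return raw.decode(encoding, 'ignore')
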
"de.l3s.boilerpipe.sax.ImageExtractor").INSTANCE - images = extractor.process(self.source, self.data) - jpype.java.util.Collections.sort(images) - images = [ - { - 'src' : image.getSrc(), - 'width' : image.getWidth(), - 'height': image.getHeight(), - 'alt' : image.getAlt(), - 'area' : image.getArea() - } for image in images - ] - return images + + def getDate(self): + r='(19[7-9][0-9]|20[0-1][0-9])-(0[1-9]|1[0-2])-([1-2][0-9]|0[1-9]|3[0-1]) ([0-1][0-9]|2[0-4]):([0-5][0-9]):([0-5][0-9])' + return re.search(r,self.data).group() From 4969588893c800021272815cab053e64c88c01d8 Mon Sep 17 00:00:00 2001 From: caimujia Date: Mon, 1 Jun 2015 11:06:11 +0800 Subject: [PATCH 2/3] Update setup.py --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 149408b..0a4569a 100644 --- a/setup.py +++ b/setup.py @@ -43,8 +43,8 @@ def download_jars(datapath, version=boilerpipe_version): ], author='Misja Hoebe', author_email='misja.hoebe@gmail.com', - maintainer = 'Matthew Russell', - maintainer_email = 'ptwobrussell@gmail.com', + maintainer = 'Matthew Russell','CaiMujia' + maintainer_email = 'ptwobrussell@gmail.com','caimujia@gmail.com' url = 'https://github.com/ptwobrussell/python-boilerpipe/', classifiers=[ 'Development Status :: 5 - Production/Stable', From 83c63e5065c3a247f65221efd25071533d059f2f Mon Sep 17 00:00:00 2001 From: caimujia Date: Mon, 1 Jun 2015 11:13:31 +0800 Subject: [PATCH 3/3] Update README.rst --- README.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.rst b/README.rst index 09fe779..2b52cc4 100644 --- a/README.rst +++ b/README.rst @@ -6,6 +6,8 @@ python-boilerpipe A python wrapper for Boilerpipe_, an excellent Java library for boilerplate removal and fulltext extraction from HTML pages. + + Configuration ============= @@ -36,6 +38,8 @@ If no extractor is passed the DefaultExtractor will be used by default. Addition from boilerpipe.extract import Extractor extractor = Extractor(extractor='ArticleExtractor', url=your_url) + + ``you can chose 'url' ,'html' or 'file' as second argment. Then, to extract relevant content: @@ -46,3 +50,4 @@ Then, to extract relevant content: extracted_html = extractor.getHTML() .. _Boilerpipe: http://code.google.com/p/boilerpipe/ +