Skip to content

Commit

Permalink
check url on esjzone.cc or not
Browse files Browse the repository at this point in the history
  • Loading branch information
ZALin committed Aug 13, 2021
1 parent 1237345 commit b7e5e60
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 24 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# .gitignore

*~
.DS_Store
.DS_Store
*.txt
55 changes: 32 additions & 23 deletions esjbackup.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,40 @@
#!/usr/bin/env python
#coding=utf-8

import requests
import lxml.html

import re

def write_page(url, dst_file):
    """Fetch one chapter page and append its title and body text to a file.

    The title is taken from the first ``<h2>`` element of the page and the
    body from the first ``<div class="forum-content mt-3">`` element. The
    title is followed by one newline and the body by two.

    Args:
        url: Address of the chapter page to download.
        dst_file: Path of the file to append to; created if it is missing.

    Raises:
        IndexError: If the page has no ``<h2>`` element or no
            ``forum-content mt-3`` container.
    """
    r = requests.get(url)
    html_element = lxml.html.document_fromstring(r.text)
    title = html_element.xpath('//h2')[0]
    content = html_element.xpath('//div[@class="forum-content mt-3"]')[0]
    # Binary append mode with bytes newlines: the encoded UTF-8 text is
    # written as-is, so this works on Python 3 as well as Python 2 (mixing
    # encoded bytes with a str '\n' raises TypeError on Python 3).
    with open(dst_file, 'ab') as f:
        f.write(title.text_content().encode('utf-8') + b'\n')
        f.write(content.text_content().encode('utf-8') + b'\n\n')

if __name__ == "__main__":

    # Identifier inserted into the detail-page URL below. It is empty here;
    # presumably it has to be set to a real novel id before running.
    novel_id = ''
    r = requests.get('https://www.esjzone.cc/detail/' + novel_id + '.html')
    html_element = lxml.html.document_fromstring(r.text)

    # The output file is named after the novel title shown on the detail page.
    dst_filename = html_element.xpath('//h2[@class="p-t-10 text-normal"]')[0].text_content() + ".txt"
    chapter_list = html_element.get_element_by_id("chapterList").getchildren()

    # Only links that point at a forum page on esjzone.cc are downloaded.
    # Compiled once here instead of on every loop iteration.
    forum_url_pattern = re.compile(r'esjzone\.cc/forum/\d+/\d+\.html')

    for element in chapter_list:

        # Write the entry's own text (chapter name or heading) first. The file
        # is closed again before write_page() appends to it, so the output
        # stays in order. Binary mode + bytes newline keeps this working on
        # both Python 2 and Python 3.
        with open(dst_filename, 'ab') as f:
            f.write(element.text_content().encode('utf-8') + b'\n')

        if element.tag == 'a':

            # An <a> without an href is treated like an off-site link
            # instead of raising KeyError.
            href = element.get('href', '')

            if forum_url_pattern.search(href):
                write_page(href, dst_filename)
            else:
                # Notice text means roughly "off-site link, skipped".
                with open(dst_filename, 'ab') as f:
                    f.write(u'{非站內連結,略過}\n'.encode('utf-8'))

0 comments on commit b7e5e60

Please sign in to comment.