-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsewtomd.py
executable file
·154 lines (130 loc) · 5.3 KB
/
sewtomd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#! /bin/env python
"""
sewtomd.py filters HTML files exported from Confluence wiki pages and converts
them to markdown using pandoc. Removing special div elements from the exported
html creates a cleaner markdown file from pandoc.
"""
import argparse
import logging
import shutil
import subprocess as sp
import sys
from pathlib import Path
from typing import Optional

from bs4 import BeautifulSoup
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class ConfluenceConverter:
    """
    Filter an HTML page exported from Confluence and convert it to markdown.

    The expected call order is set_html_path(), set_markdown_path(),
    load_html(), modify_html() and finally write_markdown().
    """

    # Path of the HTML input file; None until set_html_path() is called.
    html_path: Optional[Path]
    # Directory of the HTML input file; image paths are tried relative to it.
    resource_path: Optional[Path]
    # Path of the markdown output file; None until set_markdown_path().
    markdown_path: Optional[Path]
    # Passed to pandoc as --toc-depth; a table of contents is only requested
    # when this is greater than zero.
    toc_depth: int

    def __init__(self) -> None:
        """Create a converter with no paths set and no document loaded."""
        self.resource_path = None
        self.toc_depth = 0
        self.html_path = None
        self.markdown_path = None
        # Parsed BeautifulSoup tree; None until load_html() is called.
        self.soup = None

    def set_html_path(self, path):
        """Setup the html input path and a resource path relative to that."""
        self.html_path = Path(path)
        self.resource_path = self.html_path.parent

    def load_html(self):
        """Read the html input at html_path and parse it into a soup tree."""
        # Hand BeautifulSoup the raw bytes so that it detects the document
        # encoding itself rather than depending on the locale's default
        # text encoding.
        with open(self.html_path, "rb") as htmldoc:
            self.soup = BeautifulSoup(htmldoc, "html.parser")

    def set_markdown_path(self, mdpath):
        """Set the path of the markdown file that write_markdown() creates."""
        self.markdown_path = Path(mdpath)

    def delete_tags(self, name, **args):
        """
        Remove every tag matching name and the keyword filters from the tree.

        The arguments are passed straight through to soup.find_all().
        """
        for tag in self.soup.find_all(name, **args):
            # A match nested inside an earlier match was already destroyed
            # together with its ancestor, so there is nothing left to do.
            if getattr(tag, "decomposed", False):
                continue
            logger.debug("deleting: %s, id=%s, class=%s", tag.name,
                         tag.get('id'), tag.get('class'))
            tag.decompose()

    def rename_image(self, src, alt):
        """
        Given the alt and src attributes of an img, generate a new more
        descriptive file name.

        The name is derived from alt, with spaces replaced by underscores and
        slashes removed, and it is given the extension of src unless it
        already ends with it (compared case-insensitively).  When alt is
        missing, or nothing is left of it after removing slashes, the file
        name of src is returned instead.  The result is a Path with a single
        component.
        """
        # Only the file name of src matters, not its directory.
        srcfile = Path(Path(src).name)
        name = str(alt).replace(" ", "_").replace("/", "") if alt else ""
        if not name:
            return srcfile
        dst = Path(name)
        if dst.suffix.lower() != srcfile.suffix.lower():
            dst = Path(str(dst) + srcfile.suffix)
        return dst

    @staticmethod
    def _is_file(path: Path) -> bool:
        """Return whether path is an existing file, treating errors as no."""
        # An img src may be a URL or a long data: URI, which the OS can
        # reject outright (e.g. file name too long) instead of reporting it
        # as missing.
        try:
            return path.is_file()
        except (OSError, ValueError):
            return False

    def resolve_image(self, src) -> Optional[Path]:
        """
        Return a Path where src exists locally as a file, else None.

        src is tried as given first and then relative to resource_path, if
        that has been set.
        """
        found = None
        if src is not None:
            candidates = [Path(src)]
            if self.resource_path is not None:
                candidates.append(self.resource_path.joinpath(src))
            found = next((p for p in candidates if self._is_file(p)), None)
        logger.info("src %s: %s", src,
                    f"found at path: {found}" if found else "not found")
        return found

    def modify_html(self):
        """
        Modify the html document to make it suitable for pandoc.

        Deletes the page-metadata, footer, breadcrumb-section and
        "pageSection group" div elements.  Every img whose src resolves to a
        local file has that file copied under a more descriptive name and
        its src attribute updated to the new name.
        """
        # HTML defines heading levels h1 through h6.
        headers = self.soup.find_all([f"h{level}" for level in range(1, 7)])
        for h in headers:
            logger.debug("header: %s, content=%s", h.name, h.string)

        self.delete_tags('div', class_='page-metadata')
        self.delete_tags('div', id='footer')
        self.delete_tags('div', id="breadcrumb-section")
        self.delete_tags('div', class_="pageSection group")

        for img in self.soup.find_all('img'):
            alt = img.get('alt')
            # A missing src yields None here, which resolve_image() accepts.
            src = img.get('src')
            logger.info("Found img: alt='%s', src='%s', data-image-src='%s'",
                        alt, src, img.get('data-image-src'))
            # See if this path exists locally, perhaps relative to the
            # html path.
            src = self.resolve_image(src)
            if src is None:
                continue
            # dst is a bare file name, so the copy is made in the current
            # working directory.
            dst = self.rename_image(src, alt)
            logger.info("copying %s to %s...", src, dst)
            try:
                shutil.copy(src, dst)
            except shutil.SameFileError:
                # The image is already in place under its final name.
                logger.debug("%s is already at %s, not copied", src, dst)
            # And update the img location.
            img['src'] = str(dst)
            logger.debug("updated img tag: %s", repr(img))

    def write_markdown(self):
        """
        Run the html through pandoc to get github-flavored markdown.

        The serialized soup tree is fed to pandoc on stdin and pandoc's
        stdout is written to markdown_path.  This waits for pandoc to finish.

        Raises:
            FileNotFoundError: if the pandoc executable cannot be found.
            subprocess.CalledProcessError: if pandoc exits with a non-zero
                status.
        """
        # The no-highlight omits the syntaxhighlighter-pre language selector
        # from confluence, since that looks useless and not portable.  We
        # don't use --extract-media because that downloads images whose src
        # is a url.  Instead only locally existing files (ie attachments) are
        # copied and renamed when the html is filtered.
        cmd = ["pandoc", "--strip-comments", "--standalone",
               "--no-highlight",
               "--from", "html-native_divs-native_spans", "--to", "gfm"]
        if self.toc_depth > 0:
            cmd += ["--toc", f"--toc-depth={self.toc_depth}"]
        logger.info("writing markdown to '%s': %s", self.markdown_path,
                    " ".join(cmd))
        # The with block closes the output file, and run() waits for pandoc,
        # so the markdown is complete on disk when this method returns.
        with self.markdown_path.open("wb") as output:
            sp.run(cmd, input=self.soup.encode("utf8"), stdout=output,
                   check=True)
def main(argv):
    """
    Convert one exported HTML file to markdown from the command line.

    argv is the list of command-line arguments without the program name: the
    HTML input path, the markdown output path, and optionally --toc-depth N
    to request a table of contents of that depth from pandoc.
    """
    parser = argparse.ArgumentParser("sewtomd.py",
                                     description=__doc__)
    parser.add_argument("html", help="HTML input file")
    parser.add_argument("markdown",
                        help="Write output to markdown path.")
    # Exposes the converter's toc_depth setting; the default of 0 keeps the
    # table of contents off, as when the option is not given.
    parser.add_argument("--toc-depth", type=int, default=0, metavar="N",
                        help="Add a table of contents with N heading "
                             "levels; 0, the default, adds none.")
    args = parser.parse_args(argv)

    logging.basicConfig(level=logging.INFO)
    cc = ConfluenceConverter()
    cc.toc_depth = args.toc_depth
    cc.set_html_path(args.html)
    cc.set_markdown_path(args.markdown)
    cc.load_html()
    cc.modify_html()
    cc.write_markdown()
# Run the converter only when executed as a script, passing the command-line
# arguments without the program name.
if __name__ == "__main__":
    main(sys.argv[1:])