Skip to content

Commit

Permalink
feat: Side-by-side translation for content with line breaks. resolved y…
Browse files Browse the repository at this point in the history
  • Loading branch information
bookfere committed Mar 25, 2024
1 parent 62cdbc8 commit a046c1e
Show file tree
Hide file tree
Showing 14 changed files with 294 additions and 363 deletions.
11 changes: 6 additions & 5 deletions advanced.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,6 @@ def clean_cache(self, cache):

@pyqtSlot()
def prepare_ebook_data(self):
self.progress_detail.emit(
'Start processing the ebook: %s' % self.ebook.title)
input_path = self.ebook.get_input_path()
element_handler = get_element_handler(
self.engine_class.placeholder, self.engine_class.separator)
Expand All @@ -88,6 +86,8 @@ def prepare_ebook_data(self):
cache = get_cache(cache_id)

if cache.is_fresh() or not cache.is_persistence():
self.progress_detail.emit(
'Start processing the ebook: %s' % self.ebook.title)
cache.set_info('title', self.ebook.title)
cache.set_info('engine_name', self.engine_class.name)
cache.set_info('target_lang', self.ebook.target_lang)
Expand Down Expand Up @@ -133,9 +133,10 @@ def prepare_ebook_data(self):
if self.cancel():
self.clean_cache(cache)
return

self.progress_detail.emit(
'The ebook content was extracted successfully.')
else:
self.progress_detail.emit(
'Loading data from cache and preparing user interface...')
time.sleep(0.1)

self.finished.emit(cache_id)

Expand Down
24 changes: 18 additions & 6 deletions batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

from .lib.config import get_config
from .lib.translation import get_engine_class
from .lib.conversion import extra_formats
from .lib.encodings import encoding_list
from .engines.custom import CustomTranslate
from .components import (
layout_info, AlertMessage, SourceLang, TargetLang, InputFormat,
Expand All @@ -12,11 +14,11 @@
try:
from qt.core import (
QDialog, QWidget, QPushButton, QHeaderView, QVBoxLayout, QTableWidget,
QTableWidgetItem, Qt)
QTableWidgetItem, Qt, QComboBox, QLabel)
except ImportError:
from PyQt5.Qt import (
QDialog, QWidget, QPushButton, QHeaderView, QVBoxLayout, QTableWidget,
QTableWidgetItem, Qt)
QTableWidgetItem, Qt, QComboBox, QLabel)

load_translations()

Expand Down Expand Up @@ -52,7 +54,7 @@ def layout_translate(self):
table.setRowCount(len(self.ebooks))
table.setColumnCount(5)
table.setHorizontalHeaderLabels([
_('Title'), _('Input Format'), _('Output Format'),
_('Title'), _('Encoding'), _('Input Format'), _('Output Format'),
_('Source Language'), _('Target Language')])

header = table.horizontalHeader()
Expand All @@ -68,11 +70,21 @@ def layout_translate(self):
ebook_title.setSizeHint(table.sizeHint())
table.setItem(row, 0, ebook_title)

if ebook.input_format in extra_formats.keys():
input_encoding = QComboBox()
input_encoding.addItems(encoding_list)
input_encoding.currentTextChanged.connect(
lambda encoding, row=row: self.ebooks[row]
.set_encoding(encoding))
else:
input_encoding = QLabel(_('Default'))
table.setCellWidget(row, 2, input_encoding)

input_fmt = InputFormat(ebook.files.keys())
table.setCellWidget(row, 1, input_fmt)

output_format = OutputFormat()
table.setCellWidget(row, 2, output_format)
table.setCellWidget(row, 3, output_format)

exist_format = output_format.findText(ebook.input_format)
if ebook.is_extra_format() and exist_format:
Expand All @@ -94,7 +106,7 @@ def change_output_format(format, row=row):
output_format.currentTextChanged.connect(change_output_format)

source_lang = SourceLang(book_lang=ebook.source_lang)
table.setCellWidget(row, 3, source_lang)
table.setCellWidget(row, 4, source_lang)
self.source_langs.append(source_lang)

def change_source_lang(lang, row=row):
Expand All @@ -108,7 +120,7 @@ def change_source_lang(lang, row=row):
not issubclass(translation_engine, CustomTranslate))

target_lang = TargetLang()
table.setCellWidget(row, 4, target_lang)
table.setCellWidget(row, 5, target_lang)
self.target_langs.append(target_lang)

def change_target_lang(lang, row=row):
Expand Down
84 changes: 63 additions & 21 deletions lib/element.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,9 +193,6 @@ def get_attributes(self):
attributes = dict(self.element.attrib.items())
return json.dumps(attributes) if attributes else None

def delete(self):
    """Detach the wrapped element from its parent node."""
    # NOTE(review): lxml's remove() appears to drop the element's tail text
    # along with it, and getparent() returns None for a root element --
    # confirm callers never reach this with a parentless element.
    self.element.getparent().remove(self.element)

def _safe_remove(self, element, replacement=''):
previous, parent = element.getprevious(), element.getparent()
if previous is not None:
Expand All @@ -211,23 +208,25 @@ def get_content(self):
element_copy = self._element_copy()
for noise in self._get_descendents(element_copy, ('rt', 'rp')):
self._safe_remove(noise)

# Reserve the <br> element instead of using a line break to prevent
# conflicts with the mechanism of merge translation.
target_elements = (
'img', 'code', 'hr', 'sub', 'sup', 'kbd', 'abbr', 'wbr', 'var',
'img', 'code', 'br', 'hr', 'sub', 'sup', 'kbd', 'abbr', 'wbr', 'var',
'canvas', 'svg', 'script', 'style')
self.reserve_elements = self._get_descendents(
element_copy, target_elements)
for eid, reserve in enumerate(self.reserve_elements):
replacement = self.placeholder[0].format(format(eid, '05'))
parent = reserve.getparent()
if parent is not None and get_name(parent) == 'a':
index = self.reserve_elements.index(reserve)
self.reserve_elements[index] = reserve = parent
if get_name(reserve) in ['sub', 'sup']:
parent = reserve.getparent()
if parent is not None and get_name(parent) == 'a' and \
parent.text is None and reserve.tail is None and \
len(parent.getchildren()) == 1:
index = self.reserve_elements.index(reserve)
self.reserve_elements[index] = reserve = parent
self._safe_remove(reserve, replacement)
for br in self._get_descendents(element_copy, 'br'):
self._safe_remove(br, '<br/>')

return trim(''.join(element_copy.itertext())).replace('<br/>', '\n')
return trim(''.join(element_copy.itertext()))

def _polish_translation(self, translation):
translation = translation.replace('\n', '<br/>')
Expand Down Expand Up @@ -261,7 +260,7 @@ def add_translation(self, translation=None):
if translation is None:
if self.position in ('left', 'right'):
self.element.addnext(self._create_table())
self.delete()
self._safe_remove(self.element)
return
# Escape the markups (<m id=1 />) to replace escaped markups.
translation = xml_escape(translation)
Expand All @@ -282,7 +281,7 @@ def add_translation(self, translation=None):
if element_name in group_elements:
if self.position == 'only':
self.element.addnext(new_element)
self.delete()
self._safe_remove(self.element)
new_element = self._create_new_element(
'span', translation, excluding_tags=['class'])
if self.position in ['left', 'above']:
Expand Down Expand Up @@ -320,25 +319,68 @@ def add_translation(self, translation=None):
# Add translation for left or right position.
if self.position in ('left', 'right') and not is_text_element:
self.element.addnext(self._create_table(new_element))
self.delete()
self._safe_remove(self.element)
return

# TODO: Needs to be optimized for various situations.
# Add translation for line breaks.
line_break_tag = '{%s}br' % ns['x']
original_br_list = list(self.element.iterchildren(line_break_tag))
translation_br_list = list(new_element.iterchildren(line_break_tag))
if len(original_br_list) == len(translation_br_list) >= 5:
tail = None
for index, br in enumerate(original_br_list):
new_br = etree.SubElement(self.element, 'br')
translation_br = translation_br_list[index]
br.addprevious(new_br)
if self.position == 'below':
for sibling in translation_br.itersiblings(preceding=True):
if get_name(sibling) == 'br':
break
new_br.addnext(sibling)
new_br.tail = new_element.text if index == 0 else tail
tail = translation_br.tail
if br == original_br_list[-1]:
new_br = etree.SubElement(self.element, 'br')
self.element.append(new_br)
translation_br = translation_br_list[-1]
for sibling in translation_br.itersiblings():
new_br.addnext(sibling)
new_br.tail = translation_br.tail
else:
for sibling in translation_br.itersiblings():
if get_name(sibling) == 'br':
break
new_br.addnext(sibling)
new_br.tail = translation_br.tail
if br == original_br_list[-1]:
new_br = etree.SubElement(self.element, 'br')
new_br.tail = self.element.text
self.element.text = new_element.text
self.element.insert(0, new_br)
translation_br = translation_br_list[0]
sblings = list(
translation_br.itersiblings(preceding=True))
for sibling in reversed(sblings):
new_br.addprevious(sibling)
return

parent = self.element.getparent()
is_table_element = parent is not None and \
get_name(parent) in group_elements
parent_element = self.element.getparent()
is_table_descendant = parent_element is not None and \
get_name(parent_element) in group_elements

if self.position in ('left', 'above'):
self.element.addprevious(new_element)
if is_text_element and is_table_element:
if is_text_element and is_table_descendant:
new_element.addnext(etree.SubElement(self.element, 'br'))
elif is_text_element:
new_element.tail = ' '
else:
self.element.addnext(new_element)
if self.position == 'only':
self.delete()
self._safe_remove(self.element)
return
if is_text_element and is_table_element:
if is_text_element and is_table_descendant:
self.element.addnext(etree.SubElement(self.element, 'br'))
elif is_text_element:
if self.element.tail is not None:
Expand Down
5 changes: 2 additions & 3 deletions lib/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,9 +255,8 @@ def handle(self, paragraphs=[]):
raise Exception(_('Translation failed.'))
consuming = round((time.time() - start_time) / 60, 2)
self.log('Time consuming: %s minutes' % consuming)
message = _('Translation completed.')
self.log(message)
self.progress(1, message)
self.log(_('Translation completed.'))
self.progress(1, _('Outputting ebook file...'))


def get_engine_class(engine_name=None):
Expand Down
98 changes: 93 additions & 5 deletions tests/test_element.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ def test_get_raw(self):
def test_get_text(self):
    """get_text() returns 'a' for the element prepared by the fixture."""
    self.assertEqual('a', self.element.get_text())

def test_test_content(self):
def test_get_content(self):
self.assertEqual('a', self.element.get_content())

def test_add_translation_only(self):
Expand Down Expand Up @@ -381,14 +381,38 @@ def test_get_text(self):
def test_get_content(self):
content = ('{{id_00000}} a {{id_00001}} b c {{id_00002}} d e '
'{{id_00003}} f g {{id_00004}} h {{id_00005}} i '
'{{id_00006}} {{id_00007}} k{{id_00008}}\nl')
'{{id_00006}} {{id_00007}} k{{id_00008}}{{id_00009}}l')
self.assertEqual(content, self.element.get_content())
self.assertEqual(9, len(self.element.reserve_elements))
self.assertEqual(10, len(self.element.reserve_elements))

for element in self.element.reserve_elements:
with self.subTest(element=element):
self.assertIsNone(element.tail)

def test_get_content_with_sup_sub(self):
    """Check which node get_content() reserves for a <sup> inside an <a>.

    The paragraph holds four anchors, each containing a <sup>:
    a) the <sup> is the anchor's only content,
    b) the anchor has a second child (<span>),
    c) the anchor has text before the <sup>,
    d) the <sup> has trailing text inside the anchor.
    Only in case (a) is the whole <a> expected in reserve_elements; in the
    other cases just the <sup> is expected, so the neighbouring text stays
    in the extracted content.
    """
    xhtml = etree.XML(rb"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
<head><title>Test Document</title></head>
<body>
<p class="abc">
a<a><sup>[1]</sup></a>
b<a><sup>[1]</sup> <span>x</span></a>
c<a>x <sup>[2]</sup></a>
d<a><sup>[3]</sup> x</a>
</p>
</body>
</html>""")
    # Replace the fixture element with the paragraph built above.
    self.element = PageElement(xhtml.find('.//x:p', namespaces=ns), 'p1')
    self.element.placeholder = Base.placeholder
    # One placeholder per anchor, in document order; the text surrounding
    # the <sup> in cases (b)-(d) remains visible in the content.
    content = (
        'a{{id_00000}} b{{id_00001}} x cx {{id_00002}} d{{id_00003}} x')
    self.assertEqual(content, self.element.get_content())
    # Case (a): the enclosing <a> is reserved instead of the <sup>.
    self.assertEqual('a', get_name(self.element.reserve_elements[0]))
    # Cases (b)-(d): only the <sup> itself is reserved.
    self.assertEqual('sup', get_name(self.element.reserve_elements[1]))
    self.assertEqual('sup', get_name(self.element.reserve_elements[2]))
    self.assertEqual('sup', get_name(self.element.reserve_elements[3]))

def test_get_attributes(self):
    """get_attributes() returns the element's attributes as a JSON string."""
    self.assertEqual('{"class": "abc"}', self.element.get_attributes())

Expand Down Expand Up @@ -451,7 +475,8 @@ def test_add_translation_with_placeholder(self):
translation = ('{{id_00000}} Aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa '
'{{id_00001}} Bbbbbbbbbbbbbbb C {{id_00002}} D E '
'{{id_00003}} F G {{id_00004}} H \n\n{{id_00005}} I '
'{{id_00006}} {{id_00007}} K{ { id _ 0 00 08 } }\nL')
'{{id_00006}} {{id_00007}} K{ { id _ 0 00 08 } }'
'{{id_00009}}L')
self.element.add_translation(translation)

translation = ('<p xmlns="http://www.w3.org/1999/xhtml" class="abc">'
Expand All @@ -473,7 +498,8 @@ def test_add_translation_with_markup(self):
self.element.get_content()
translation = ('<m id=00000 /> A <m id=00001 /> B C <m id=00002 /> D '
'E <m id=00003 /> F G <m id=00004 /> H <m id=00005 /> '
'I <m id=00006 /> <m id=00007 /> K<m id=00008 />\nL')
'I <m id=00006 /> <m id=00007 /> K<m id=00008 />'
'<m id=00009 />L')
self.element.add_translation(translation)

translation = ('<p xmlns="http://www.w3.org/1999/xhtml" class="abc">'
Expand Down Expand Up @@ -589,6 +615,68 @@ def test_add_translation_table(slef):
def test_add_translation_table_only(slef):
pass

def test_add_translation_line_break_below(self):
    """With position 'below', each translated line follows its original.

    The paragraph is split into six lines by five <br/> elements. After
    add_translation() the expected markup interleaves the lines as
    original, <br/>, translation, <br/>, ... inside the same <p>, and the
    reserved inline elements (<sup>, <img>) reappear in the translated
    lines.
    """
    xhtml = etree.XML(rb"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
<head><title>Test Document</title></head>
<body>
<p>
a<sup>[1]</sup> <span>b</span> c<sup>[2]</sup><br />
<em>d</em> e <img src="/test.jpg" /><br />
f <strong>g</strong><br/>h<br/>i<br/>j
</p>
</body>
</html>""")

    element = PageElement(xhtml.find('.//x:p', namespaces=ns), 'p1')
    element.placeholder = Base.placeholder
    element.position = 'below'
    # Called before add_translation(); presumably this populates
    # reserve_elements so the placeholders below can be resolved.
    element.get_content()
    # Placeholders 00000-00007 stand for, in document order:
    # <sup>, <sup>, <br/>, <img>, <br/>, <br/>, <br/>, <br/>.
    element.add_translation(
        'A{{id_00000}} B C{{id_00001}}{{id_00002}} D E {{id_00003}}'
        '{{id_00004}} F G{{id_00005}}H{{id_00006}}I{{id_00007}}J ')
    # Expected serialization of the whole document after the merge.
    translation = (
        '<html xmlns="http://www.w3.org/1999/xhtml" lang="en"> <head>'
        '<title>Test Document</title></head> <body> '
        '<p> a<sup>[1]</sup> <span>b</span> c<sup>[2]</sup><br/>'
        'A<sup>[1]</sup> B C<sup>[2]</sup><br/> <em>d</em> e '
        '<img src="/test.jpg"/><br/> D E <img src="/test.jpg"/><br/>'
        ' f <strong>g</strong><br/> F G<br/>h<br/>H<br/>i<br/>I<br/>j '
        '<br/>J</p> </body> </html>')
    self.assertEqual(translation, get_string(xhtml))

def test_add_translation_line_break_above(self):
    """With position 'above', each translated line precedes its original.

    Uses the same six-line paragraph (five <br/> elements) as the 'below'
    variant. The expected markup interleaves the lines as translation,
    <br/>, original, <br/>, ... inside the same <p>.
    """
    xhtml = etree.XML(rb"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
<head><title>Test Document</title></head>
<body>
<p>
a<sup>[1]</sup> <span>b</span> c<sup>[2]</sup><br />
<em>d</em> e <img src="/test.jpg" /><br />
f <strong>g</strong><br/>h<br/>i<br/>j
</p>
</body>
</html>""")

    element = PageElement(xhtml.find('.//x:p', namespaces=ns), 'p1')
    element.placeholder = Base.placeholder
    element.position = 'above'
    # Called before add_translation(); presumably this populates
    # reserve_elements so the placeholders below can be resolved.
    element.get_content()
    # Placeholders 00000-00007 stand for, in document order:
    # <sup>, <sup>, <br/>, <img>, <br/>, <br/>, <br/>, <br/>.
    element.add_translation(
        'A{{id_00000}} B C{{id_00001}}{{id_00002}} D E {{id_00003}}'
        '{{id_00004}} F G{{id_00005}}H{{id_00006}}I{{id_00007}}J')
    # Expected serialization of the whole document after the merge.
    translation = (
        '<html xmlns="http://www.w3.org/1999/xhtml" lang="en"> <head>'
        '<title>Test Document</title></head> <body> '
        '<p>A<sup>[1]</sup> B C<sup>[2]</sup><br/>'
        ' a<sup>[1]</sup> <span>b</span> c<sup>[2]</sup><br/>'
        ' D E <img src="/test.jpg"/><br/> <em>d</em> e '
        '<img src="/test.jpg"/><br/> F G<br/> f <strong>g</strong><br/>'
        'H<br/>h<br/>I<br/>i<br/>J<br/>j </p> </body> </html>')
    self.assertEqual(translation, get_string(xhtml))

def test_add_translation_attr(self):
self.element.translation_lang = 'zh'
self.element.original_color = 'green'
Expand Down
Loading

0 comments on commit a046c1e

Please sign in to comment.