diff --git a/tests/subtitles/sample_hls.vtt b/tests/subtitles/sample_hls.vtt
new file mode 100644
index 0000000..dcc5318
--- /dev/null
+++ b/tests/subtitles/sample_hls.vtt
@@ -0,0 +1,50 @@
+WEBVTT
+X-TIMESTAMP-MAP=LOCAL:1086:59:52.424,MPEGTS:183000
+
+1088:42:00.500 --> 1088:42:07.000
+Caption text #1
+
+1088:42:07.000 --> 1088:42:11.890
+Caption text #2
+
+1088:42:11.890 --> 1088:42:16.320
+Caption text #3
+
+1088:42:16.320 --> 1088:42:21.580
+Caption text #4
+
+1088:42:21.580 --> 1088:42:23.880
+Caption text #5
+
+1088:42:23.880 --> 1088:42:27.280
+Caption text #6
+
+1088:42:27.280 --> 1088:42:30.280
+Caption text #7
+
+1088:42:30.280 --> 1088:42:36.510
+Caption text #8
+
+1088:42:36.510 --> 1088:42:38.870
+Caption text #9
+
+1088:42:38.870 --> 1088:42:45.000
+Caption text #10
+
+1088:42:45.000 --> 1088:42:47.000
+Caption text #11
+
+1088:42:47.000 --> 1088:42:50.970
+Caption text #12
+
+1088:42:50.970 --> 1088:42:54.440
+Caption text #13
+
+1088:42:54.440 --> 1088:42:58.600
+Caption text #14
+
+1088:42:58.600 --> 1088:43:01.350
+Caption text #15
+
+1088:43:01.350 --> 1088:43:04.300
+Caption text #16
diff --git a/tests/webvtt.py b/tests/webvtt.py
index e54a0b8..720afba 100644
--- a/tests/webvtt.py
+++ b/tests/webvtt.py
@@ -2,8 +2,9 @@
 import io
 from shutil import rmtree, copy
 
+from webvtt.webvtt import HlsWebVTT
 import webvtt
-from webvtt.structures import Caption, Style
+from webvtt.structures import Caption, Style, TimestampMap
 from .generic import GenericParserTestCase
 from webvtt.errors import MalformedFileError
 
@@ -244,6 +245,53 @@ def test_read_malformed_buffer(self):
 
         with self.assertRaises(MalformedFileError):
             webvtt.read_buffer(buffer)
+
+    def test_timestamp_map(self):
+        tag_one = 'X-TIMESTAMP-MAP=LOCAL:1086:59:52.424,MPEGTS:183000'
+        tag_two = 'X-TIMESTAMP-MAP=MPEGTS:183000,LOCAL:1086:59:52.424'
+        timestamp_map_one = TimestampMap(tag_one)
+        timestamp_map_two = TimestampMap(tag_two)
+
+        self.assertEqual(
+            timestamp_map_one.local,
+            '1086:59:52.424'
+        )
+        self.assertEqual(
+            timestamp_map_two.local,
+            '1086:59:52.424'
+        )
+        self.assertEqual(
+            timestamp_map_one.mpegts,
+            '183000'
+        )
+        self.assertEqual(
+            timestamp_map_two.mpegts,
+            '183000'
+        )
+
+    def test_timestamp_map_tag_is_case_insensitive(self):
+        tag = 'x-timestamp-map=mpegts:900000,local:00:00:00.000'
+        timestamp_map = TimestampMap(tag)
+        self.assertEqual(timestamp_map.local, '00:00:00.000')
+        self.assertEqual(timestamp_map.mpegts, '900000')
+
+    def test_read_hls_webvtt(self):
+        vtt = HlsWebVTT.read(self._get_file('sample_hls.vtt'))
+        self.assertIsInstance(vtt.captions, list)
+        self.assertIsInstance(vtt.timestamp_map, TimestampMap)
+        self.assertTrue(vtt.timestamp_map.local)
+        self.assertTrue(vtt.timestamp_map.mpegts)
+
+    def test_read_non_hls_webvtt_with_hls_webvtt_parser(self):
+        vtt = HlsWebVTT.read(self._get_file('sample.vtt'))
+        self.assertIsInstance(vtt.captions, list)
+        self.assertIsNone(vtt.timestamp_map)
+
+    def test_write_non_hls_webvtt_omits_timestamp_map(self):
+        vtt = HlsWebVTT.read(self._get_file('sample.vtt'))
+        buffer = io.StringIO()
+        vtt.write(buffer)
+        self.assertNotIn('None', buffer.getvalue().splitlines()[:2])
 
     def test_captions(self):
         vtt = webvtt.read(self._get_file('sample.vtt'))
diff --git a/webvtt/errors.py b/webvtt/errors.py
index 9ee549b..4a10329 100644
--- a/webvtt/errors.py
+++ b/webvtt/errors.py
@@ -16,3 +16,7 @@ class InvalidCaptionsError(Exception):
 
 class MissingFilenameError(Exception):
     """Error raised when saving a file without filename."""
+
+
+class HlsTimeMapError(Exception):
+    """Error raised when an HLS X-TIMESTAMP-MAP tag is malformed."""
diff --git a/webvtt/parsers.py b/webvtt/parsers.py
index 011cad8..9889dbb 100644
--- a/webvtt/parsers.py
+++ b/webvtt/parsers.py
@@ -3,7 +3,7 @@
 import codecs
 
 from .errors import MalformedFileError, MalformedCaptionError
-from .structures import Block, Style, Caption
+from .structures import Block, Style, Caption, TimestampMap
 
 
 class TextBasedParser(object):
@@ -249,6 +249,27 @@ def _is_style_block(self, block):
         """Returns True if it is a style block"""
         return re.match(self.STYLE_PATTERN, block.lines[0])
 
+
+class HlsWebVTTParser(WebVTTParser):
+    """
+    HLS WebVTT parser.
+
+    Supports the X-TIMESTAMP-MAP tag, exposed as the timestamp_map attribute.
+    """
+
+    def _parse(self, lines):
+        """Read the X-TIMESTAMP-MAP tag (if any), then parse as WebVTT."""
+        raw_timestamp_map = next(
+            (line for line in lines if line.upper().startswith('X-TIMESTAMP-MAP')),
+            None
+        )
+        # Files without the tag are plain WebVTT; expose None for them.
+        if raw_timestamp_map is None:
+            self.timestamp_map = None
+        else:
+            self.timestamp_map = TimestampMap(raw_timestamp_map)
+        super(HlsWebVTTParser, self)._parse(lines)
+
 
 class SBVParser(TextBasedParser):
     """
diff --git a/webvtt/structures.py b/webvtt/structures.py
index 84f376d..97f9421 100644
--- a/webvtt/structures.py
+++ b/webvtt/structures.py
@@ -1,11 +1,50 @@
 import re
 
-from .errors import MalformedCaptionError
+from .errors import MalformedCaptionError, HlsTimeMapError
 
 TIMESTAMP_PATTERN = re.compile('(\d+)?:?(\d{2}):(\d{2})[.,](\d{3})')
+# LOCAL component of an X-TIMESTAMP-MAP tag (named group: local).
+TIMESTAMP_MAP_LOCAL_PATTERN = re.compile(
+    r'X-TIMESTAMP-MAP=.*?LOCAL:(?P<local>(\d+)?:?(\d{2}):(\d{2})[.,](\d{3}))',
+    re.IGNORECASE
+)
+# MPEGTS component of an X-TIMESTAMP-MAP tag (named group: mpegts).
+TIMESTAMP_MAP_MPEGTS_PATTERN = re.compile(
+    r'X-TIMESTAMP-MAP=.*?MPEGTS:(?P<mpegts>\d+)',
+    re.IGNORECASE
+)
 
 __all__ = ['Caption']
 
+
+class TimestampMap(object):
+    """
+    Representation of the X-TIMESTAMP-MAP tag of an HLS WebVTT file.
+
+    The LOCAL and MPEGTS components may appear in any order in the tag.
+    """
+
+    def __init__(self, raw_timestamp_map):
+        # Case-insensitive, like the parser and the patterns reading the tag.
+        if 'X-TIMESTAMP-MAP' not in raw_timestamp_map.upper():
+            raise HlsTimeMapError('X-TIMESTAMP-MAP tag not found')
+        self._raw = raw_timestamp_map
+
+    def __str__(self):
+        return 'X-TIMESTAMP-MAP=LOCAL:%s,MPEGTS:%s' % (self.local, self.mpegts)
+
+    @property
+    def local(self):
+        """LOCAL timestamp as written in the tag, or None when missing."""
+        match = TIMESTAMP_MAP_LOCAL_PATTERN.search(self._raw)
+        return match.group('local') if match else None
+
+    @property
+    def mpegts(self):
+        """MPEGTS value of the tag as a string, or None when missing."""
+        match = TIMESTAMP_MAP_MPEGTS_PATTERN.search(self._raw)
+        return match.group('mpegts') if match else None
+
 
 class Caption(object):
 
diff --git a/webvtt/webvtt.py b/webvtt/webvtt.py
index 139181d..800ce62 100644
--- a/webvtt/webvtt.py
+++ b/webvtt/webvtt.py
@@ -1,6 +1,6 @@
 import os
 
-from .parsers import WebVTTParser, SRTParser, SBVParser
+from .parsers import WebVTTParser, HlsWebVTTParser, SRTParser, SBVParser
 from .writers import WebVTTWriter, SRTWriter
 from .errors import MissingFilenameError
 
@@ -131,3 +131,51 @@ def total_length(self):
     @property
     def styles(self):
         return self._styles
+
+
+class HlsWebVTT(WebVTT):
+    """WebVTT captions that also keep the HLS X-TIMESTAMP-MAP header tag."""
+
+    def __init__(self, file='', captions=None, styles=None, timestamp_map=None):
+        self._timestamp_map = timestamp_map
+        super(HlsWebVTT, self).__init__(file, captions, styles)
+
+    @property
+    def timestamp_map(self):
+        """TimestampMap read from the input, or None when it has no tag."""
+        return self._timestamp_map
+
+    @classmethod
+    def read(cls, file):
+        """Reads a WebVTT captions file."""
+        parser = HlsWebVTTParser().read(file)
+        return cls(
+            file=file,
+            captions=parser.captions,
+            styles=parser.styles,
+            timestamp_map=parser.timestamp_map
+        )
+
+    @classmethod
+    def read_buffer(cls, buffer):
+        """Reads a WebVTT captions from a file-like object.
+        Such file-like object may be the return of an io.open call,
+        io.StringIO object, tempfile.TemporaryFile object, etc."""
+        parser = HlsWebVTTParser().read_from_buffer(buffer)
+        return cls(
+            captions=parser.captions,
+            styles=parser.styles,
+            timestamp_map=parser.timestamp_map
+        )
+
+    def write(self, f, format='vtt'):
+        """Writes the captions to f, keeping the X-TIMESTAMP-MAP tag if any."""
+        if format == 'vtt':
+            header_lines = ['WEBVTT']
+            # Plain WebVTT input has no timestamp map; writing str(None)
+            # would add a bogus 'None' header line.
+            if self.timestamp_map is not None:
+                header_lines.append(str(self.timestamp_map))
+            WebVTTWriter(header_lines=header_lines).write(self._captions, f)
+        elif format == 'srt':
+            SRTWriter().write(self._captions, f)
diff --git a/webvtt/writers.py b/webvtt/writers.py
index f4ae812..edac4c1 100644
--- a/webvtt/writers.py
+++ b/webvtt/writers.py
@@ -1,8 +1,17 @@
 
 class WebVTTWriter(object):
+    """Writer for WebVTT output; header_lines are written first, one per line."""
+
+    def __init__(self, header_lines=None):
+        # None instead of a list literal as default: a mutable default would
+        # be one list object shared by every writer instance.
+        if header_lines is None:
+            header_lines = ['WEBVTT']
+        self._header_lines = list(header_lines)
 
     def write(self, captions, f):
-        f.write('WEBVTT\n')
+        for line in self._header_lines:
+            f.write(line + '\n')
         for c in captions:
             if c.identifier:
                 f.write('\n' + c.identifier)