diff options
Diffstat (limited to 'plugin.video.mediathekview/classes/ttml2srt.py')
-rw-r--r-- | plugin.video.mediathekview/classes/ttml2srt.py | 221 |
1 files changed, 221 insertions, 0 deletions
diff --git a/plugin.video.mediathekview/classes/ttml2srt.py b/plugin.video.mediathekview/classes/ttml2srt.py new file mode 100644 index 0000000..e36b41e --- /dev/null +++ b/plugin.video.mediathekview/classes/ttml2srt.py @@ -0,0 +1,221 @@ +# -*- coding: utf-8 -*- +# Copyright 2017 Laura Klünder +# See https://github.com/codingcatgirl/ttml2srt +# +# MIT License +# +# Copyright (c) 2017 Laura Klünder +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
import re
import io
import sys
from datetime import timedelta
from xml.etree import ElementTree as ET

try:
    _STRING_TYPES = (str, unicode)  # Python 2: byte-string and unicode paths
except NameError:
    _STRING_TYPES = (str,)  # Python 3

# TTML offset-time, e.g. "1.5s" or "250ms".
_OFFSET_TIME_RE = re.compile(r'^([0-9]+(\.[0-9]+)?)(h|m|s|ms|f|t)$')
# TTML clock-time, e.g. "00:01:02.500".
_CLOCK_TIME_RE = re.compile(
    r'^([0-9]{2,}):([0-9]{2,}):([0-9]{2,}(\.[0-9]+)?)$')
# TTML clock-time with a trailing frame field, e.g. "00:01:02:12".
_CLOCK_TIME_FRAMES_RE = re.compile(
    r'^([0-9]{2,}):([0-9]{2,}):([0-9]{2,}):([0-9]{2,}(\.[0-9]+)?)$')

# Offset-time metric -> timedelta keyword argument.
_OFFSET_UNITS = {
    'h': 'hours',
    'm': 'minutes',
    's': 'seconds',
    'ms': 'milliseconds',
}

# Elements that terminate a line in the rendered subtitle text.
_LINE_BREAK_TAGS = ('div', 'p', 'br')


def _strip_namespaces(root):
    """Drop the '{namespace}' prefix from every tag and attribute name."""
    # Element.getiterator() was removed in Python 3.9; iter() exists on
    # both Python 2.7 and Python 3.
    for elem in root.iter():
        elem.tag = elem.tag.split('}', 1)[-1]
        elem.attrib = {name.split('}', 1)[-1]: value
                       for name, value in elem.attrib.items()}


def _collect_styles(root):
    """Return {style id: {'color': ..., 'fontstyle': ...}} from the head."""
    styles = {}
    for elem in root.findall('./head/styling/style'):
        style_id = elem.attrib.get('id')
        if style_id is None:
            # A style without an id can never be referenced.
            continue
        style = {}
        color = elem.attrib.get('color')
        # Plain white/black is treated as "no colour" and left unstyled.
        if color is not None and color not in ('#FFFFFF', '#000000'):
            style['color'] = color
        if elem.attrib.get('fontStyle') == 'italic':
            style['fontstyle'] = 'italic'
        styles[style_id] = style
    return styles


def _parse_time_expression(expression, default_offset=timedelta(0)):
    """Convert a TTML time expression into a timedelta.

    Offset times ("1.5s", "200ms", ...) are added to default_offset;
    clock times ("HH:MM:SS.fff") are returned as-is.

    Raises NotImplementedError for frame/tick based expressions and
    ValueError for anything unrecognised.
    """
    offset_time = _OFFSET_TIME_RE.match(expression)
    if offset_time:
        metric = offset_time.group(3)
        if metric == 'f':
            raise NotImplementedError(
                'Parsing time expressions by frame is not supported!')
        if metric == 't':
            raise NotImplementedError(
                'Parsing time expressions by ticks is not supported!')
        amount = float(offset_time.group(1))
        return default_offset + timedelta(**{_OFFSET_UNITS[metric]: amount})

    clock_time = _CLOCK_TIME_RE.match(expression)
    if clock_time:
        hours, minutes, seconds = clock_time.group(1, 2, 3)
        return timedelta(hours=int(hours), minutes=int(minutes),
                         seconds=float(seconds))

    if _CLOCK_TIME_FRAMES_RE.match(expression):
        raise NotImplementedError(
            'Parsing time expressions by frame is not supported!')

    raise ValueError('unknown time expression: %s' % expression)


def _resolve_times(elem, times, default_begin=timedelta(0)):
    """Fill times[elem] = (begin, end) for elem and all its descendants.

    An element without 'begin' inherits its parent's begin; 'end' is None
    when neither 'end' nor 'dur' is given. If both are present the
    earlier of the two wins.
    """
    attrib = elem.attrib
    if 'begin' in attrib:
        begin = _parse_time_expression(attrib['begin'],
                                       default_offset=default_begin)
    else:
        begin = default_begin

    end = None
    if 'end' in attrib:
        end = _parse_time_expression(attrib['end'],
                                     default_offset=default_begin)
    if 'dur' in attrib:
        dur_end = begin + _parse_time_expression(attrib['dur'])
        end = dur_end if end is None else min(end, dur_end)

    times[elem] = (begin, end)
    for child in elem:
        _resolve_times(child, times, default_begin=begin)


def _render(elem, timestamp, times, styles):
    """Return the SRT markup visible under elem at the given timestamp."""
    begin, end = times[elem]
    if timestamp < begin:
        return ''
    if end is not None and timestamp >= end:
        return ''

    # The style attribute is a whitespace separated list of ids; unknown
    # ids are ignored instead of aborting the whole conversion.
    style = {}
    for style_id in elem.attrib.get('style', '').split():
        style.update(styles.get(style_id, {}))
    has_color = 'color' in style
    is_italic = style.get('fontstyle') == 'italic'

    parts = []
    if has_color:
        parts.append('<font color="%s">' % style['color'])
    if is_italic:
        parts.append('<i>')

    if elem.text:
        parts.append(elem.text.strip())
    for child in elem:
        parts.append(_render(child, timestamp, times, styles))
        if child.tail:
            parts.append(child.tail.strip())

    # Close tags in reverse order of opening so the markup nests properly.
    if is_italic:
        parts.append('</i>')
    if has_color:
        parts.append('</font>')

    if elem.tag in _LINE_BREAK_TAGS:
        parts.append('\n')
    return ''.join(parts)


def _format_timestamp(timestamp):
    """Format a timedelta as an SRT timecode 'HH:MM:SS,mmm'."""
    # Integer arithmetic keeps every field zero padded and avoids a
    # rounded seconds field of "60,000".
    total_us = ((timestamp.days * 86400 + timestamp.seconds) * 1000000
                + timestamp.microseconds)
    total_ms = (total_us + 500) // 1000
    seconds, millis = divmod(total_ms, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, seconds, millis)


def ttml2srt(infile, outfile):
    """Convert a TTML subtitle document to SubRip (SRT).

    infile  -- file name or file object accepted by ElementTree.parse().
    outfile -- file name, or a binary file-like object with write() and
               close(). UTF-8 encoded data is written as bytearray
               chunks and the object is closed afterwards.

    If the document contains no timed elements the function returns
    without creating or touching outfile.

    Raises NotImplementedError for frame/tick based time expressions and
    ValueError for unparsable ones.
    """
    root = ET.parse(infile).getroot()
    _strip_namespaces(root)
    styles = _collect_styles(root)

    body = root.find('./body')
    if body is None:
        return

    times = {}
    _resolve_times(body, times)

    # Every begin/end below the body is a point where the visible text
    # may change.
    timestamps = set()
    for elem in body.iter():
        if elem is body:
            continue
        timestamps.update(t for t in times[elem] if t is not None)
    if not timestamps:
        return

    # Render the text at each change point, merging neighbours whose
    # text is identical.
    cues = []
    last_text = None
    for timestamp in sorted(timestamps):
        text = re.sub(r'\n\n\n+', '\n\n',
                      _render(body, timestamp, times, styles)).strip()
        if text != last_text:
            cues.append((timestamp, text))
            last_text = text

    # Sentinel giving an open-ended final cue an end time.
    cues.append((cues[-1][0] + timedelta(hours=24), ''))

    if isinstance(outfile, _STRING_TYPES):
        # Binary mode: the encoded bytes are written as-is.
        out = io.open(outfile, 'wb')
    else:
        out = outfile

    try:
        srt_index = 1
        for i, (timestamp, content) in enumerate(cues[:-1]):
            if content == '':
                continue
            entry = '%d\n%s --> %s\n%s\n\n' % (
                srt_index,
                _format_timestamp(timestamp),
                _format_timestamp(cues[i + 1][0]),
                content,
            )
            out.write(bytearray(entry.encode('utf-8')))
            srt_index += 1
    finally:
        out.close()