# -*- coding: utf-8 -*- # Copyright 2017 Laura Klünder # See https://github.com/codingcatgirl/ttml2srt # # MIT License # # Copyright (c) 2017 Laura Klünder # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import re import io from datetime import timedelta from xml.etree import ElementTree as ET def ttml2srt( infile, outfile ): tree = ET.parse( infile ) root = tree.getroot() # strip namespaces for elem in root.getiterator(): elem.tag = elem.tag.split('}', 1)[-1] elem.attrib = {name.split('}', 1) [-1]: value for name, value in elem.attrib.items()} # get styles styles = {} for elem in root.findall('./head/styling/style'): style = {} if 'color' in elem.attrib: color = elem.attrib['color'] if color not in ('#FFFFFF', '#000000'): style['color'] = color if 'fontStyle' in elem.attrib: fontstyle = elem.attrib['fontStyle'] if fontstyle in ('italic', ): style['fontstyle'] = fontstyle styles[elem.attrib['id']] = style body = root.find('./body') # parse correct start and end times def parse_time_expression(expression, default_offset=timedelta(0)): offset_time = re.match(r'^([0-9]+(\.[0-9]+)?)(h|m|s|ms|f|t)$', expression) if offset_time: time_value, fraction, metric = offset_time.groups() time_value = float(time_value) if metric == 'h': return default_offset + timedelta(hours=time_value) elif metric == 'm': return default_offset + timedelta(minutes=time_value) elif metric == 's': return default_offset + timedelta(seconds=time_value) elif metric == 'ms': return default_offset + timedelta(milliseconds=time_value) elif metric == 'f': raise NotImplementedError( 'Parsing time expressions by frame is not supported!') elif metric == 't': raise NotImplementedError( 'Parsing time expressions by ticks is not supported!') clock_time = re.match( r'^([0-9]{2,}):([0-9]{2,}):([0-9]{2,}(\.[0-9]+)?)$', expression) if clock_time: hours, minutes, seconds, fraction = clock_time.groups() return timedelta(hours=int(hours), minutes=int(minutes), seconds=float(seconds)) clock_time_frames = re.match( r'^([0-9]{2,}):([0-9]{2,}):([0-9]{2,}):([0-9]{2,}(\.[0-9]+)?)$', expression) if clock_time_frames: raise NotImplementedError( 'Parsing time expressions by frame is not supported!') raise ValueError('unknown time expression: %s' % expression) def parse_times(elem, default_begin=timedelta(0)): if 'begin' in elem.attrib: begin = parse_time_expression( elem.attrib['begin'], default_offset=default_begin) else: begin = default_begin elem.attrib['{abs}begin'] = begin end = None if 'end' in elem.attrib: end = parse_time_expression( elem.attrib['end'], default_offset=default_begin) dur = None if 'dur' in elem.attrib: dur = parse_time_expression(elem.attrib['dur']) if dur is not None: if end is None: end = begin + dur else: end = min(end, begin + dur) elem.attrib['{abs}end'] = end for child in elem: parse_times(child, default_begin=begin) parse_times(body) timestamps = set() for elem in body.findall('.//*[@{abs}begin]'): timestamps.add(elem.attrib['{abs}begin']) for elem in body.findall('.//*[@{abs}end]'): timestamps.add(elem.attrib['{abs}end']) timestamps.discard(None) # render subtitles on each timestamp def render_subtitles(elem, timestamp, parent_style=None): if timestamp < elem.attrib['{abs}begin']: return '' if elem.attrib['{abs}end'] is not None and timestamp >= elem.attrib['{abs}end']: return '' result = '' style = parent_style.copy() if parent_style is not None else {} if 'style' in elem.attrib: style.update(styles[elem.attrib['style']]) if 'color' in style: result += '' % style['color'] if style.get('fontstyle') == 'italic': result += '' if elem.text: result += elem.text.strip() if len(elem): for child in elem: result += render_subtitles(child, timestamp) if child.tail: result += child.tail.strip() if 'color' in style: result += '' if style.get('fontstyle') == 'italic': result += '' if elem.tag in ('div', 'p', 'br'): result += '\n' return result rendered = [] for timestamp in sorted(timestamps): rendered.append((timestamp, re.sub(r'\n\n\n+', '\n\n', render_subtitles(body, timestamp)).strip())) if not rendered: exit(0) # group timestamps together if nothing changes rendered_grouped = [] last_text = None for timestamp, content in rendered: if content != last_text: rendered_grouped.append((timestamp, content)) last_text = content # output srt rendered_grouped.append((rendered_grouped[-1][0] + timedelta(hours=24), '')) def format_timestamp(timestamp): return ('%02d:%02d:%02.3f' % (timestamp.total_seconds() // 3600, timestamp.total_seconds() // 60 % 60, timestamp.total_seconds() % 60)).replace('.', ',') if isinstance( outfile, str ) or isinstance( outfile, unicode ): file = io.open( outfile, 'w', encoding='utf-8' ) else: file = outfile srt_i = 1 for i, (timestamp, content) in enumerate(rendered_grouped[:-1]): if content == '': continue file.write( bytearray( '%d\n' % srt_i, 'utf-8' ) ) file.write( bytearray( format_timestamp( timestamp ) + ' --> ' + format_timestamp( rendered_grouped[i + 1][0] ) + '\n' ) ) file.write( bytearray( content + '\n\n', 'utf-8' ) ) srt_i += 1 file.close()