aaoth.xyz repos — tg2md (856df0ec5823ae12682fe4c4bcd4c15a062be1dc): tg2md.py

tg2md.py (view raw)
  1#!/usr/bin/env python
  2
  3# tg2md.py - converts telegram json to jekyll md.
  4# Copyright (c) 2020, Lev Brekalov
  5
  6# TODO summary:
  7# - replies
  8# - single/multiple tags
  9# - forwarded posts
 10# - custom post header
 11# - multiple photos in one post
 12
 13import os
 14import argparse
 15import json
 16from datetime import datetime
 17
 18
 19def print_default_post_header(post_title, post_date, post_tag):
 20
 21    '''
 22    returns default post header
 23    '''
 24
 25    # TODO: handle post tag/tags
 26    # TODO: support for custom header
 27    post_header = '---\n'\
 28        'title: {title}\n'\
 29        'date: {date}\n'\
 30        'tags: {tag}\n'\
 31        'layout: post\n'\
 32        '---\n'.format(title=post_title, date=post_date, tag=post_tag)
 33
 34    return post_header
 35
 36
 37def print_custom_post_header(post_header_file, *args):
 38
 39    '''
 40    now unusable (i dunno how it may work)
 41    '''
 42
 43    with post_header_file as f:
 44        post_header_content = read(post_header_file)
 45    for arg in args:
 46        pass
 47    return post_header_content
 48
 49
 50def parse_post_photo(post, photo_dir):
 51
 52    '''
 53    converts photo tag to markdown image link
 54    '''
 55
 56    post_photo_src = os.path.basename(post['photo'])
 57    post_photo_src = os.path.join(photo_dir, post_photo_src)
 58    post_photo = '![image]({src})\n\n'.format(src=post_photo_src)
 59
 60    return post_photo
 61
 62
 63def parse_post_photo_as_file(post, media_dir):
 64
 65    '''
 66    converts file tag with thumbnail to image and a link
 67    '''
 68
 69    # links to files are currently broken, because these files are
 70    # going to `files` directory, not `photos`.
 71    # need to track down any files with thumbnails and then to move them
 72    # to a photos directory.
 73    post_photo_file_src = os.path.basename(post['file'])
 74    post_photo_file_src = os.path.join(media_dir, post_photo_file_src)
 75    post_photo_thumbnail_src = os.path.basename(post['thumbnail'])
 76    post_photo_thumbnail_src = os.path.join(media_dir,
 77                                            post_photo_thumbnail_src)
 78
 79    post_photo_as_file = '![image]({thumb})\n[full size]({file})\n\n'\
 80        .format(thumb=post_photo_thumbnail_src, file=post_photo_file_src)
 81
 82    return post_photo_as_file
 83
 84
 85def text_format(string, fmt):
 86
 87    '''
 88    wraps string in markdown-styled formatting
 89    '''
 90
 91    if fmt in ('*', '**', '***', '`', '```'):
 92        output = '{fmt}{txt}{fmt}'
 93    elif fmt == '```':
 94        output = '{fmt}\n{txt}\n{fmt}'
 95    else:
 96        output = '<{fmt}>{txt}</{fmt}>'
 97
 98    output = output.format(fmt=fmt, txt=string.strip())
 99    output += '\n' * string.split('\n').count('') * string.endswith('\n')
100    return output
101
102
def text_link_format(text, link):

    '''
    formats links

    text -- visible link text
    link -- link target

    surrounding whitespace of the text is moved outside of the link and
    the line breaks that ended the original text are appended after it.
    returns the markdown link.
    '''

    # convert telegram links to anchors
    # this implies that telegram links are pointing to the same channel
    if link.startswith('https://t.me/c/'):
        link = '#' + link.split('/')[-1]

    link_fmt = '[{text}]({href})'.format(text=text.strip(), href=link)

    # restore only the line breaks trimmed from the end; line breaks
    # inside the link text are still part of the stripped text
    trimmed_tail = text[len(text.rstrip()):]
    link_fmt += '\n' * trimmed_tail.count('\n')
    return link_fmt
117
118
def parse_text_object(obj):

    '''
    detects type of text object and wraps it in corresponding formatting

    obj -- dict with at least 'type' and 'text' keys ('text_link'
           objects also need 'href')

    italic, bold, code, pre, underline and strikethrough objects are
    wrapped with text_format(), text links go through
    text_link_format(), links and emails become <autolinks>.
    every other type (hashtag, phone and any type not handled here) is
    returned as plain text, so the result is always a string.
    '''

    obj_type = obj['type']
    obj_text = obj['text']

    if obj_type == 'text_link':
        return text_link_format(obj_text, obj['href'])

    if obj_type in ('link', 'email'):
        link = obj_text.strip()
        # bare links get a scheme, links that already carry one
        # (http://, https://, ...) are left untouched
        scheme, separator, _ = link.partition('://')
        if obj_type == 'link' and not (separator and scheme.isalnum()):
            link = 'https://' + link
        return '<{href}>'.format(href=link)

    # formatting marker handed to text_format() for each styled type
    markers = {
        'italic': '*',
        'bold': '**',
        'code': '`',
        'pre': '```',
        'underline': 'u',
        'strikethrough': 's',
    }
    if obj_type in markers:
        return text_format(obj_text, markers[obj_type])

    # hashtag, phone and unsupported types: keep the text instead of
    # returning None, which the caller would write out as "None"
    return obj_text
162
163
def parse_post_text(post):

    '''
    converts the text of a post to markdown

    post['text'] is either a plain string or a list mixing plain strings
    with text objects (dicts); text objects are formatted with
    parse_text_object(). a post without a 'text' key yields an empty
    string. returns the assembled text.
    '''

    # TODO: handle reply-to
    post_raw_text = post.get('text', '')

    if isinstance(post_raw_text, str):
        return post_raw_text

    return ''.join(
        obj if isinstance(obj, str) else str(parse_text_object(obj))
        for obj in post_raw_text)
180
181
def parse_post_media(post, media_dir):

    '''
    wraps the post's media file into an html audio player

    the file name of post['file'] is re-rooted under media_dir and used
    as the source, post['mime_type'] as its type.
    '''

    media_src = os.path.join(media_dir, os.path.basename(post['file']))

    # audiofiles can be presented as audioplayers and other media types
    # could be left as just links to them
    # ???
    # (the indentation inside the markup is part of the emitted text)
    player = (
        '\n<audio controls>\n'
        '            <source src="{src}" type="{mime_type}">\n'
        '            </audio>'
    )

    return player.format(src=media_src, mime_type=post['mime_type'])
200
201
def parse_post_file(post, media_dir):

    '''
    wrap files into link tags

    post      -- post dict with a 'file' path
    media_dir -- directory the link should point into

    the link label is the file name without its last extension, the
    target is the file name re-rooted under media_dir.
    returns the markdown link surrounded by blank lines.
    '''

    post_file = os.path.basename(post['file'])
    # splitext keeps dot-files such as ".bashrc" intact instead of
    # producing an empty label
    post_file_name = os.path.splitext(post_file)[0]
    # point the link into media_dir like the other media helpers do
    post_file_src = os.path.join(media_dir, post_file)

    return f'\n\n[{post_file_name}]({post_file_src})\n\n'
215
def parse_post(post, photo_dir, media_dir):

    '''
    renders a whole post object as markdown text

    pieces are emitted in this order: photo, file-with-thumbnail image,
    post text, then either the media player or a plain file link.
    '''

    has_file = 'file' in post
    has_thumbnail = 'thumbnail' in post
    pieces = []

    # optional image
    # TODO: handle multiple photos in one post (maybe by comparing timestamps)
    if 'photo' in post:
        pieces.append(str(parse_post_photo(post, photo_dir)))

    if has_file and has_thumbnail:
        pieces.append(str(parse_post_photo_as_file(post, media_dir)))

    # post text
    pieces.append(str(parse_post_text(post)))

    # optional media
    if 'media_type' in post:
        pieces.append(str(parse_post_media(post, media_dir)))
    elif has_file and not has_thumbnail:
        pieces.append(str(parse_post_file(post, media_dir)))

    return ''.join(pieces)
242
243
def main():

    '''
    command line entry point

    parses the arguments, loads the telegram export json and writes one
    markdown file per non-forwarded post of type 'message' into the
    output directory, named <date>-<id>.md.
    exits with an error message when the json file does not exist.
    '''

    parser = argparse.ArgumentParser(
            usage='%(prog)s [options] json_file',
            description='Convert exported Telegram channel data json to \
                    bunch of markdown posts ready to use with jekyll')
    parser.add_argument(
            'json', metavar='json_file',
            help='result.json file from telegram export')
    parser.add_argument(
            '--out-dir', metavar='out_dir',
            nargs='?', default='formatted_posts',
            help='output directory for markdown files\
                    (default: formatted_posts)')
    parser.add_argument(
            '--photo-dir', metavar='photo_dir',
            nargs='?', default='photos',
            help='location of image files. this changes only links\
                    to photos in markdown text, so specify your\
                    desired location (default: photos)')
    parser.add_argument(
            '--media-dir', metavar='media_dir',
            nargs='?', default='files',
            help='location of media files. this changes only links\
                    to files in markdown text, so specify your \
                    desired location (default: files)')
    args_wip = parser.add_argument_group('work in progress')
    args_wip.add_argument(
            '--post-header', metavar='post_header',
            nargs='?',
            help='yaml front matter for your posts \
                    (now doesn\'t work)')

    args = parser.parse_args()

    # an existing directory is fine; missing parent directories of a
    # nested output path are created as well
    os.makedirs(args.out_dir, exist_ok=True)

    # load json file
    try:
        with open(args.json, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        # SystemExit is raised directly: the sys module is not imported
        # in this file, so sys.exit() is not available here
        raise SystemExit('result.json not found.\nPlease, specify right file')

    # load only messages
    raw_posts = data['messages']

    for post in raw_posts:
        # TODO: handle forwarded posts
        if post['type'] == 'message' and 'forwarded_from' not in post:

            post_date = datetime.fromisoformat(post['date'])
            post_id = post['id']
            post_filename = str(post_date.date()) + '-' + str(post_id) + '.md'
            post_path = os.path.join(args.out_dir, post_filename)

            with open(post_path, 'w', encoding='utf-8') as f:
                print(print_default_post_header(post_id, post_date, None),
                      file=f)
                print(parse_post(post, args.photo_dir, args.media_dir), file=f)
307
308
309if __name__ == '__main__':
310    main()
all repos — tg2md @ 856df0ec5823ae12682fe4c4bcd4c15a062be1dc

converter from telegram json to jekyll md