aaoth.xyz repos — tg2md (ab90cef7a02809b7ac044322fd140a192b1bf182): tg2md.py

tg2md.py (view raw)
  1#!/usr/bin/env python
  2
# tg2md.py - converts telegram json to jekyll md.
  4# Copyright (c) 2020, Lev Brekalov
  5
  6# TODO summary:
  7# - replies
# - single/multiple tags
  9# - forwarded posts
 10# - custom post header
 11# - multiple photos in one post
 12
import argparse
import json
import logging
import os
import sys
from datetime import datetime
 18
 19
 20def print_default_post_header(post_title, post_date, post_tag):
 21
 22    '''
 23    returns default post header
 24    '''
 25
 26    # TODO: handle post tag/tags
 27    # TODO: support for custom header
 28    post_header = '---\n'\
 29        'title: {title}\n'\
 30        'date: {date}\n'\
 31        'tags: {tag}\n'\
 32        'layout: post\n'\
 33        '---\n'.format(title=post_title, date=post_date, tag=post_tag)
 34
 35    return post_header
 36
 37
 38def print_custom_post_header(post_header_file, *args):
 39
 40    '''
 41    now unusable (i dunno how it may work)
 42    '''
 43
 44    with post_header_file as f:
 45        post_header_content = read(post_header_file)
 46    for arg in args:
 47        pass
 48    return post_header_content
 49
 50
 51def parse_post_photo(post, photo_dir):
 52
 53    '''
 54    converts photo tag to markdown image link
 55    '''
 56
 57    post_photo_src = os.path.basename(post['photo'])
 58    post_photo_src = os.path.join(photo_dir, post_photo_src)
 59    post_photo = '![image]({src})\n\n'.format(src=post_photo_src)
 60
 61    return post_photo
 62
 63
 64def parse_post_photo_as_file(post, media_dir):
 65
 66    '''
 67    converts file tag with thumbnail to image and a link
 68    '''
 69
 70    # links to files are currently broken, because these files are
 71    # going to `files` directory, not `photos`.
 72    # need to track down any files with thumbnails and then to move them
 73    # to a photos directory.
 74    post_photo_file_src = os.path.basename(post['file'])
 75    post_photo_file_src = os.path.join(media_dir, post_photo_file_src)
 76    post_photo_thumbnail_src = os.path.basename(post['thumbnail'])
 77    post_photo_thumbnail_src = os.path.join(media_dir,
 78                                            post_photo_thumbnail_src)
 79
 80    post_photo_as_file = '![image]({thumb})\n[full size]({file})\n\n'\
 81        .format(thumb=post_photo_thumbnail_src, file=post_photo_file_src)
 82
 83    return post_photo_as_file
 84
 85
 86def text_format(string, fmt):
 87
 88    '''
 89    wraps string in markdown-styled formatting
 90    '''
 91
 92    if fmt in ('*', '**', '***', '`', '```'):
 93        output = '{fmt}{txt}{fmt}'
 94    elif fmt == '```':
 95        output = '{fmt}\n{txt}\n{fmt}'
 96    else:
 97        output = '<{fmt}>{txt}</{fmt}>'
 98
 99    output = output.format(fmt=fmt, txt=string.strip())
100    output += '\n' * string.split('\n').count('') * string.endswith('\n')
101    return output
102
103
def text_link_format(text, link):

    '''
    renders a text link as a markdown link `[text](link)`

    the link text is stripped of surrounding whitespace; when text ends
    with a newline, one newline per newline found in text is appended
    after the link
    '''

    # convert telegram links to anchors
    # this implies that telegram links are pointing to the same channel
    if link.startswith('https://t.me/c/'):
        href = '#' + link.rsplit('/', 1)[-1]
    else:
        href = link

    label = text.strip()
    tail = '\n' * text.count('\n') if text.endswith('\n') else ''

    return f'[{label}]({href}){tail}'
118
119
def parse_text_object(obj):

    '''
    detects type of text object and wraps it in corresponding formatting

    obj is a dict with at least 'type' and 'text' keys ('text_link'
    objects also carry 'href'). always returns a string: object types
    without a dedicated formatting (hashtags, phones and any type not
    listed here) are returned as their plain text.
    '''

    obj_type = obj['type']
    obj_text = obj['text']

    if obj_type == 'text_link':
        return text_link_format(obj_text, obj['href'])

    if obj_type in ('link', 'email'):
        link = obj_text.strip()
        if obj_type == 'link':
            # bare links get a scheme, links that already have one
            # (http://, https://, ...) are left untouched
            scheme, sep, _ = link.partition('://')
            has_scheme = bool(sep) and scheme != '' and all(
                ch.isalnum() or ch in '+-.' for ch in scheme)
            if not has_scheme:
                link = 'https://' + link
        return '<{href}>'.format(href=link)

    # object types that are just wrapped in a marker or an html tag
    markup = {
        'italic': '*',
        'bold': '**',
        'code': '`',
        'pre': '```',
        'underline': 'u',
        'strikethrough': 's',
    }
    if obj_type in markup:
        return text_format(obj_text, markup[obj_type])

    # hashtags, phones and unknown types: keep the text instead of
    # returning None, which the caller would print as 'None'
    return obj_text
163
164
def parse_post_text(post):

    '''
    converts the text of a post to a single string

    post['text'] is either a plain string, which is returned as is,
    or a sequence mixing plain strings with text objects, which are
    rendered with parse_text_object; all pieces are joined in order
    '''

    # TODO: handle reply-to
    post_raw_text = post['text']

    if isinstance(post_raw_text, str):
        return str(post_raw_text)

    return ''.join(
        obj if isinstance(obj, str) else str(parse_text_object(obj))
        for obj in post_raw_text)
181
182
def parse_post_media(post, media_dir):

    '''
    wraps media files into html tags

    the file name of post['file'] is re-rooted under media_dir and put
    into an html5 player: <video> when post['mime_type'] is a video
    type, <audio> otherwise. post['mime_type'] is copied into the
    `type` attribute of the <source> element.
    '''

    post_media_file = os.path.basename(post['file'])
    post_media_src = os.path.join(media_dir, post_media_file)
    post_mime_type = post['mime_type']

    # audiofiles can be presented as audioplayers and other media types
    # could be left as just links to them
    # ???
    # videos get a video player; every other type keeps the audio player
    if str(post_mime_type).startswith('video/'):
        tag = 'video'
    else:
        tag = 'audio'

    post_media = (
        '\n<{tag} controls>\n'
        '            <source src="{src}" type="{mime_type}">\n'
        '            </{tag}>'
    ).format(tag=tag, src=post_media_src, mime_type=post_mime_type)

    return post_media
201
202
def parse_post_file(post, media_dir):

    '''
    wrap files into link tags

    returns a markdown link surrounded by blank lines. the link text is
    the file name of post['file'] without its last extension and the
    link target is that file name re-rooted under media_dir.
    '''

    post_file = os.path.basename(post['file'])
    # splitext drops only the last extension and keeps dotfiles intact
    post_file_name = os.path.splitext(post_file)[0]
    # point the link at media_dir, like the other media helpers do
    post_file_src = os.path.join(media_dir, post_file)

    return f'\n\n[{post_file_name}]({post_file_src})\n\n'
216
def parse_post(post, photo_dir, media_dir):

    '''
    renders a whole post as one string

    the pieces are emitted in this order: photo (if any), file with a
    thumbnail (if any), the post text, then either the media player
    (when the post has a media type) or a plain link to a file that
    has no thumbnail
    '''

    has_file = 'file' in post
    has_thumbnail = 'thumbnail' in post

    chunks = []

    # optional image
    # TODO: handle multiple photos in one post (maybe by comparing timestamps)
    if 'photo' in post:
        chunks.append(parse_post_photo(post, photo_dir))

    if has_file and has_thumbnail:
        chunks.append(parse_post_photo_as_file(post, media_dir))

    # post text
    chunks.append(parse_post_text(post))

    # optional media
    if 'media_type' in post:
        chunks.append(parse_post_media(post, media_dir))
    elif has_file and not has_thumbnail:
        chunks.append(parse_post_file(post, media_dir))

    return ''.join(str(chunk) for chunk in chunks)
243
244
def main():

    '''
    command line entry point

    parses the arguments, loads the telegram export json and writes one
    markdown file per message of type `message` that is not forwarded.
    files are created in the output directory, named
    <date>-<message id>.md, and start with the default post header.
    exits with an error message when the json file does not exist.
    '''

    parser = argparse.ArgumentParser(
            usage='%(prog)s [options] json_file',
            description='Convert exported Telegram channel data json to \
                    bunch of markdown posts ready to use with jekyll')
    parser.add_argument(
            'json', metavar='json_file',
            help='result.json file from telegram export')
    parser.add_argument(
            '-o', '--out-dir', metavar='out_dir',
            nargs='?', default='formatted_posts',
            help='output directory for markdown files\
                    (default: formatted_posts)')
    parser.add_argument(
            '-p', '--photo-dir', metavar='photo_dir',
            nargs='?', default='photos',
            help='location of image files. this changes only links\
                    to photos in markdown text, so specify your\
                    desired location (default: photos)')
    parser.add_argument(
            '-m', '--media-dir', metavar='media_dir',
            nargs='?', default='files',
            help='location of media files. this changes only links\
                    to files in markdown text, so specify your \
                    desired location (default: files)')
    args_wip = parser.add_argument_group('work in progress')
    args_wip.add_argument(
            '-H', '--post-header', metavar='post_header',
            nargs='?',
            help='yaml front matter for your posts \
                    (now doesn\'t work)')

    args = parser.parse_args()

    # makedirs also creates missing parent directories and accepts a
    # directory that already exists; any other failure is only logged
    # here and will surface when the first post is written
    try:
        os.makedirs(args.out_dir, exist_ok=True)
    except OSError as error:
        logging.warning(error)

    # load json file
    try:
        with open(args.json, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        # a string argument is printed to stderr and the exit status is 1
        sys.exit('result.json not found.\nPlease, specify right file')

    # load only messages
    raw_posts = data['messages']

    for post in raw_posts:
        # TODO: handle forwarded posts
        if post['type'] == 'message' and 'forwarded_from' not in post:

            post_date = datetime.fromisoformat(post['date'])
            post_id = post['id']
            post_filename = str(post_date.date()) + '-' + str(post_id) + '.md'
            post_path = os.path.join(args.out_dir, post_filename)

            # the message id doubles as the post title; tags are not
            # extracted yet, so None is passed for the tag
            with open(post_path, 'w', encoding='utf-8') as f:
                print(print_default_post_header(post_id, post_date, None),
                      file=f)
                print(parse_post(post, args.photo_dir, args.media_dir), file=f)
308
309
310if __name__ == '__main__':
311    main()
all repos — tg2md @ ab90cef7a02809b7ac044322fd140a192b1bf182

converter from telegram json to jekyll md