aaoth.xyz repos — tg2md (c1aea08b1c64a79f724a04edc55d8d9d8e28e263): tg2md.py

tg2md.py (view raw)
  1#!/usr/bin/env python
  2
   3# tg2md.py - converts telegram json to jekyll md.
  4# Copyright (c) 2020, Lev Brekalov
  5
  6# TODO summary:
  7# - replies
   8# - single/multiple tags
  9# - forwarded posts
 10# - custom post header
 11
 12import os
 13import argparse
 14import json
 15from datetime import datetime
 16
 17def print_default_post_header(post_title, post_date, post_tag):
 18
 19
 20    '''
 21    returns default post header
 22    '''
 23
 24    # TODO: handle post tag/tags
 25    # TODO: support for custom header
 26    post_header = '---\n'\
 27        'title: {title}\n'\
 28        'date: {date}\n'\
 29        'tags: {tag}\n'\
 30        'layout: post\n'\
 31        '---\n'.format(title=post_title, date=post_date, tag=post_tag)
 32
 33    return post_header
 34
 35
 36def print_custom_post_header(post_header_file, *args):
 37
 38    '''
 39    now unusable (i dunno how it may work)
 40    '''
 41
 42    with post_header_file as f:
 43        post_header_content = read(post_header_file)
 44    for arg in args:
 45        pass
 46    return post_header_content
 47
 48
 49def parse_post_photo(post, photo_dir):
 50
 51    '''
 52    converts photo tag to markdown image link
 53    '''
 54
 55    post_photo_src = os.path.basename(post['photo'])
 56    post_photo_src = os.path.join(photo_dir, post_photo_src)
 57    post_photo = '![image]({src})\n\n'.format(src=post_photo_src)
 58
 59    return post_photo
 60
 61
 62def text_format(string, fmt):
 63
 64    '''
 65    wraps string in markdown-styled formatting
 66    '''
 67
 68    if fmt in ('*', '**', '***', '`', '```'):
 69        output = '{fmt}{txt}{fmt}'
 70    elif fmt == '```':
 71        output = '{fmt}\n{txt}\n{fmt}'
 72    else:
 73        output = '<{fmt}>{txt}</{fmt}>'
 74
 75    output = output.format(fmt=fmt, txt=string.strip())
 76    output += '\n' * string.split('\n').count('') * string.endswith('\n')
 77    return output
 78
 79
 80def text_link_format(text, link):
 81
 82    '''
 83    formats links
 84    '''
 85
 86    # convert telegram links to anchors
 87    # this implies that telegram links are pointing to the same channel
 88    if link.startswith('https://t.me/c/'):
 89        link = '#' + link.split('/')[-1]
 90    link_fmt = '[{text}]({href})'
 91    link_fmt = link_fmt.format(text=text.strip(), href=link)
 92    link_fmt += '\n' * text.count('\n') * text.endswith('\n')
 93    return link_fmt
 94
 95
 96def parse_text_object(obj):
 97
 98    '''
 99    detects type of text object and wraps it in corresponding formatting
100    '''
101
102    obj_type = obj['type']
103    obj_text = obj['text']
104
105    if obj_type == 'hashtag':
106        post_tag = obj_text
107        return post_tag
108
109    elif obj_type == 'text_link':
110        return text_link_format(obj_text, obj['href'])
111
112    elif obj_type == 'link' or obj_type == 'email':
113        link = obj_text.strip()
114        link = 'https://' * (obj_type == 'link') * \
115            (1 - link.startswith('https://')) + link
116        post_link = '<{href}>'.format(href=link)
117        return post_link
118
119    elif obj_type == 'phone':
120        return obj_text
121
122    elif obj_type == 'italic':
123        return text_format(obj_text, '*')
124
125    elif obj_type == 'bold':
126        return text_format(obj_text, '**')
127
128    elif obj_type == 'code':
129        return text_format(obj_text, '`')
130
131    elif obj_type == 'pre':
132        return text_format(obj_text, '```')
133
134    elif obj_type == 'underline':
135        return text_format(obj_text, 'u')
136
137    elif obj_type == 'strikethrough':
138        return text_format(obj_text, 's')
139
140
def parse_post_text(post):
    '''
    converts the 'text' field of a post to markdown

    the field is either a plain string, returned as is, or a list
    mixing plain strings and text objects (dicts that are passed to
    parse_text_object).
    returns the concatenated text.
    '''

    # TODO: handle reply-to
    post_raw_text = post['text']

    if isinstance(post_raw_text, str):
        return str(post_raw_text)

    parts = []
    for obj in post_raw_text:
        if isinstance(obj, str):
            parts.append(obj)
            continue

        parsed = parse_text_object(obj)
        if parsed is None:
            # no formatting available for this object: fall back to its
            # raw text rather than emitting the literal string "None"
            parsed = obj.get('text', '')
        parts.append(str(parsed))

    return ''.join(parts)
157
158
def parse_post_media(post, media_dir):
    '''
    builds an html <audio> element for the file attached to a post

    only the file name of post['file'] is kept; it is re-rooted under
    media_dir. post['mime_type'] becomes the `type` of the <source> tag.
    '''

    file_name = os.path.basename(post['file'])
    media_src = os.path.join(media_dir, file_name)

    # inner lines are padded with nine spaces, matching the emitted markup
    pad = ' ' * 9
    template = (
        '\n<audio controls>\n'
        + pad + '<source src="{src}" type="{mime_type}">\n'
        + pad + '</audio>'
    )
    return template.format(src=media_src, mime_type=post['mime_type'])
175
176
def parse_post(post, photo_dir, media_dir):
    '''
    renders a whole post as text

    the result is, in order: the image link (when the post has a
    'photo' key), the post text, and the media markup (when the post
    has a 'media_type' key).
    '''

    chunks = []

    if 'photo' in post:
        chunks.append(str(parse_post_photo(post, photo_dir)))

    chunks.append(str(parse_post_text(post)))

    if 'media_type' in post:
        chunks.append(str(parse_post_media(post, media_dir)))

    return ''.join(chunks)
197
198
def main():
    '''
    command-line entry point

    parses the arguments, loads the telegram export json and writes one
    markdown file per non-forwarded message into the output directory.
    files are named <date>-<id>.md and contain the default post header
    followed by the converted post.
    '''

    # imported locally: sys.exit is needed below and the module-level
    # imports do not provide sys
    import sys

    parser = argparse.ArgumentParser(
            usage='%(prog)s [options] json_file',
            description='Convert exported Telegram channel data json to '
                        'bunch of markdown posts ready to use with jekyll')
    parser.add_argument(
            'json', metavar='json_file',
            help='result.json file from telegram export')
    # the directory options always take a value; a bare flag used to
    # yield None and crash later when the path was used
    parser.add_argument(
            '--out-dir', metavar='out_dir',
            default='formatted_posts',
            help='output directory for markdown files '
                 '(default: formatted_posts)')
    parser.add_argument(
            '--photo-dir', metavar='photo_dir',
            default='photos',
            help='location of image files. this changes only links '
                 'to photos in markdown text, so specify your '
                 'desired location (default: photos)')
    parser.add_argument(
            '--media-dir', metavar='media_dir',
            default='files',
            help='location of media files. this changes only links '
                 'to files in markdown text, so specify your '
                 'desired location (default: files)')
    args_wip = parser.add_argument_group('work in progress')
    args_wip.add_argument(
            '--post-header', metavar='post_header',
            nargs='?',
            help='yaml front matter for your posts '
                 '(now doesn\'t work)')

    args = parser.parse_args()

    # create the output directory (and missing parents); an already
    # existing directory is fine
    os.makedirs(args.out_dir, exist_ok=True)

    # load json file
    try:
        with open(args.json, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        sys.exit('result.json not found.\nPlease, specify right file')

    # load only messages
    raw_posts = data['messages']

    for post in raw_posts:
        # TODO: handle forwarded posts
        if post['type'] != 'message' or 'forwarded_from' in post:
            continue

        post_date = datetime.fromisoformat(post['date'])
        post_id = post['id']
        post_filename = str(post_date.date()) + '-' + str(post_id) + '.md'
        post_path = os.path.join(args.out_dir, post_filename)

        with open(post_path, 'w', encoding='utf-8') as f:
            # the post id doubles as the title; tags are not handled yet
            print(print_default_post_header(
                post_id, post_date, None), file=f)
            print(parse_post(post, args.photo_dir, args.media_dir), file=f)


if __name__ == '__main__':
    main()
all repos — tg2md @ c1aea08b1c64a79f724a04edc55d8d9d8e28e263

converter from telegram json to jekyll md