tg2md.py (view raw)
1#!/usr/bin/env python
2
# tg2md.py - converts telegram json to jekyll md.
4# Copyright (c) 2020, Lev Brekalov
5
6# TODO summary:
7# - replies
# - single/multiple tags
9# - forwarded posts
10# - custom post header
11# - multiple photos in one post
12
import argparse
import json
import os
import sys
from datetime import datetime
17
18
def print_default_post_header(post_title, post_date, post_tag):

    '''
    builds the default jekyll front matter for a post

    the three arguments are interpolated as they are into the `title`,
    `date` and `tags` fields; the layout is always `post`.
    despite the name nothing is printed, the header text is returned.
    '''

    # TODO: handle post tag/tags
    # TODO: support for custom header
    header_lines = (
        '---',
        f'title: {post_title}',
        f'date: {post_date}',
        f'tags: {post_tag}',
        'layout: post',
        '---',
    )

    # every line, including the closing delimiter, ends with a newline
    return '\n'.join(header_lines) + '\n'
35
36
def print_custom_post_header(post_header_file, *args):

    '''
    reads a custom post header and returns its content unchanged

    work in progress: the extra positional arguments are accepted but
    not yet substituted into the header.

    post_header_file: an already opened file-like object. it is used as
    a context manager, so it is closed after its content has been read.
    '''

    with post_header_file as header_file:
        # read through the opened file object; a bare `read()` is not a
        # defined name and would raise NameError
        post_header_content = header_file.read()

    # TODO: decide how *args should be applied to the header
    return post_header_content
48
49
def parse_post_photo(post, photo_dir):

    '''
    converts photo tag to markdown image link

    post: message dict; only the file name part of post['photo'] is used.
    photo_dir: directory the image link should point into.

    returns the image link followed by an empty line.
    '''

    photo_name = os.path.basename(post['photo'])
    photo_src = os.path.join(photo_dir, photo_name)

    # the template must contain the {src} placeholder, otherwise format()
    # silently drops the path and only the newlines are returned
    return '![image]({src})\n\n'.format(src=photo_src)
61
62
def parse_post_photo_as_file(post, media_dir):

    '''
    converts file tag with thumbnail to image and a link

    post: message dict with 'file' and 'thumbnail' keys; only the file
    name part of each value is used.
    media_dir: directory both links should point into.

    returns the thumbnail as a markdown image, a `full size` link to the
    file on the next line, and an empty line after it.
    '''

    # links to files are currently broken, because these files are
    # going to `files` directory, not `photos`.
    # need to track down any files with thumbnails and then to move them
    # to a photos directory.
    file_src = os.path.join(media_dir, os.path.basename(post['file']))
    thumbnail_src = os.path.join(media_dir,
                                 os.path.basename(post['thumbnail']))

    # the thumbnail has to appear in the template, otherwise the `thumb`
    # argument is ignored and no image is shown at all
    return '![image]({thumb})\n[full size]({file})\n\n'\
        .format(thumb=thumbnail_src, file=file_src)
83
84
def text_format(string, fmt):

    '''
    wraps string in markdown-styled formatting

    string: text to wrap; surrounding whitespace is stripped before
    wrapping.
    fmt: either a markdown marker ('*', '**', '***', '`', '```') or an
    html tag name such as 'u' or 's'.

    returns the wrapped text. a '```' fence is put on its own lines,
    other markdown markers hug the text, and anything else is treated as
    an html tag.
    '''

    # the fence has to be tested before the generic marker list,
    # otherwise its branch can never be reached
    if fmt == '```':
        template = '{fmt}\n{txt}\n{fmt}'
    elif fmt in ('*', '**', '***', '`'):
        template = '{fmt}{txt}{fmt}'
    else:
        template = '<{fmt}>{txt}</{fmt}>'

    output = template.format(fmt=fmt, txt=string.strip())

    # strip() also removed the line breaks, so they are put back after
    # the closing marker: one per empty segment of the text, and only
    # when the text ended with a newline
    if string.endswith('\n'):
        output += '\n' * string.split('\n').count('')
    return output
101
102
def text_link_format(text, link):

    '''
    builds a markdown link out of a caption and its target

    text: link caption; surrounding whitespace is stripped.
    link: link target.

    returns `[caption](target)`. when the caption ended with a newline,
    one newline per line break found in the caption is appended.
    '''

    href = link
    # convert telegram links to anchors
    # this implies that telegram links are pointing to the same channel
    if href.startswith('https://t.me/c/'):
        href = '#' + href.rsplit('/', 1)[-1]

    caption = text.strip()
    markdown_link = f'[{caption}]({href})'

    if text.endswith('\n'):
        markdown_link += '\n' * text.count('\n')
    return markdown_link
117
118
def parse_text_object(obj):

    '''
    detects type of text object and wraps it in corresponding formatting

    obj: dict with 'type' and 'text' keys, plus 'href' for objects of
    type 'text_link'.

    returns the formatted text. hashtags, phones and every type without
    dedicated formatting are returned as their plain text, so the result
    is never None.
    '''

    obj_type = obj['type']
    obj_text = obj['text']

    if obj_type == 'text_link':
        return text_link_format(obj_text, obj['href'])

    if obj_type in ('link', 'email'):
        link = obj_text.strip()
        if obj_type == 'link':
            # bare links get a scheme, but a link that already carries
            # one (http://, https://, ...) must not be prefixed again
            scheme, separator, _ = link.partition('://')
            if not (separator and scheme.isalnum()):
                link = 'https://' + link
        return '<{href}>'.format(href=link)

    # markdown marker or html tag used for each inline style
    markers = {
        'italic': '*',
        'bold': '**',
        'code': '`',
        'pre': '```',
        'underline': 'u',
        'strikethrough': 's',
    }
    if obj_type in markers:
        return text_format(obj_text, markers[obj_type])

    # hashtag, phone and any type that is not handled above
    return obj_text
162
163
def parse_post_text(post):

    '''
    converts the text of a post to a single string

    post: message dict. post['text'] is either a plain string or a list
    that mixes plain strings with text objects (dicts).

    returns the plain strings and the formatted text objects joined in
    their original order.
    '''

    # TODO: handle reply-to
    post_raw_text = post['text']

    if isinstance(post_raw_text, str):
        return post_raw_text

    fragments = []
    for obj in post_raw_text:
        if isinstance(obj, str):
            fragments.append(obj)
            continue

        parsed = parse_text_object(obj)
        # a text object without any formatting rule must not put the
        # literal word 'None' into the post; keep its plain text instead
        fragments.append(str(obj['text'] if parsed is None else parsed))

    return ''.join(fragments)
180
181
def parse_post_media(post, media_dir):

    '''
    wraps media files into html tags

    post: message dict with 'file' and 'mime_type' keys; only the file
    name part of post['file'] is used.
    media_dir: directory the source link should point into.

    returns an html audio player preceded by a newline. every media type
    is currently rendered this way.
    '''

    media_name = os.path.basename(post['file'])
    media_src = os.path.join(media_dir, media_name)

    # audiofiles can be presented as audioplayers and other media types
    # could be left as just links to them
    # ???

    # adjacent literals instead of a backslash continuation inside one
    # literal, so the indentation of this source file does not end up in
    # the generated html
    return (
        '\n<audio controls>\n'
        '    <source src="{src}" type="{mime_type}">\n'
        '</audio>'
    ).format(src=media_src, mime_type=post['mime_type'])
200
201
def parse_post_file(post, media_dir):

    '''
    wrap files into link tags

    post: message dict; only the file name part of post['file'] is used.
    media_dir: directory the link should point into.

    returns a markdown link surrounded by empty lines. the link text is
    the file name without its last extension.
    '''

    file_name = os.path.basename(post['file'])
    # splitext keeps names like `.bashrc` intact instead of producing an
    # empty link text
    link_text, _ = os.path.splitext(file_name)
    # point the link into media_dir, the same way media links are built
    file_src = os.path.join(media_dir, file_name)

    return f'\n\n[{link_text}]({file_src})\n\n'
215
def parse_post(post, photo_dir, media_dir):

    '''
    assembles the markdown body of one post

    post: message dict.
    photo_dir: directory used for links to photos.
    media_dir: directory used for links to files and media.

    the parts are emitted in this order: photo, file shown through its
    thumbnail, text, then either a media player or a plain file link.
    '''

    has_file = 'file' in post
    has_thumbnail = 'thumbnail' in post
    chunks = []

    # optional image
    # TODO: handle multiple photos in one post (maybe by comparing timestamps)
    if 'photo' in post:
        chunks.append(str(parse_post_photo(post, photo_dir)))

    # a file that comes with a thumbnail is shown as an image
    if has_file and has_thumbnail:
        chunks.append(str(parse_post_photo_as_file(post, media_dir)))

    # post text
    chunks.append(str(parse_post_text(post)))

    # optional media; a plain file link is used only when there is
    # neither a media type nor a thumbnail
    if 'media_type' in post:
        chunks.append(str(parse_post_media(post, media_dir)))
    elif has_file and not has_thumbnail:
        chunks.append(str(parse_post_file(post, media_dir)))

    return ''.join(chunks)
242
243
def _build_arg_parser():

    '''
    creates the command line parser of the converter
    '''

    parser = argparse.ArgumentParser(
        usage='%(prog)s [options] json_file',
        description='Convert exported Telegram channel data json to '
                    'bunch of markdown posts ready to use with jekyll')
    parser.add_argument(
        'json', metavar='json_file',
        help='result.json file from telegram export')
    parser.add_argument(
        '--out-dir', metavar='out_dir',
        nargs='?', default='formatted_posts',
        help='output directory for markdown files '
             '(default: formatted_posts)')
    parser.add_argument(
        '--photo-dir', metavar='photo_dir',
        nargs='?', default='photos',
        help='location of image files. this changes only links '
             'to photos in markdown text, so specify your '
             'desired location (default: photos)')
    parser.add_argument(
        '--media-dir', metavar='media_dir',
        nargs='?', default='files',
        help='location of media files. this changes only links '
             'to files in markdown text, so specify your '
             'desired location (default: files)')
    args_wip = parser.add_argument_group('work in progress')
    args_wip.add_argument(
        '--post-header', metavar='post_header',
        nargs='?',
        help='yaml front matter for your posts '
             '(now doesn\'t work)')

    return parser


def main():

    '''
    command line entry point

    reads the exported json file and writes one markdown file per
    message into the output directory. files are named
    `<date>-<id>.md`. service entries and forwarded posts are skipped.
    exits with an error message when the json file does not exist.
    '''

    args = _build_arg_parser().parse_args()

    # an already existing output directory is fine
    os.makedirs(args.out_dir, exist_ok=True)

    # load json file
    try:
        with open(args.json, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        # needs `import sys` at the top of the file
        sys.exit('result.json not found.\nPlease, specify right file')

    # load only messages
    raw_posts = data['messages']

    for post in raw_posts:
        # TODO: handle forwarded posts
        if post['type'] != 'message' or 'forwarded_from' in post:
            continue

        post_date = datetime.fromisoformat(post['date'])
        post_id = post['id']
        post_filename = str(post_date.date()) + '-' + str(post_id) + '.md'
        post_path = os.path.join(args.out_dir, post_filename)

        with open(post_path, 'w', encoding='utf-8') as f:
            # NOTE: tags are not extracted yet, so None is passed and the
            # header reads `tags: None`
            print(print_default_post_header(post_id, post_date, None),
                  file=f)
            print(parse_post(post, args.photo_dir, args.media_dir), file=f)
307
308
# run the converter only when the file is executed as a script,
# not when it is imported as a module
if __name__ == '__main__':
    main()