tg2md.py (view raw)
1#!/usr/bin/env python
2
# tg2md.py - converts telegram json to jekyll md.
4# Copyright (c) 2020, Lev Brekalov
5
6# TODO summary:
7# - replies
# - single/multiple tags
9# - forwarded posts
10# - custom post header
11# - multiple photos in one post
12
13import os
14import logging
15import argparse
16import json
17from datetime import datetime
18
19
def print_default_post_header(post_title, post_date, post_tag):

    '''
    returns default post header (jekyll yaml front matter)

    post_title -- value for the `title` field
    post_date -- value for the `date` field
    post_tag -- value for the `tags` field: a single tag, a list/tuple
        of tags (written space-separated) or None for no tags

    values are inserted via str(), no yaml quoting/escaping is applied.
    the returned string ends with a newline after the closing `---`.
    '''

    # TODO: support for custom header
    if post_tag is None:
        # formatting None directly would produce `tags: None`, i.e. a tag
        # literally named "None"; leave the field empty instead
        tags = ''
    elif isinstance(post_tag, (list, tuple)):
        tags = ' '.join(str(tag) for tag in post_tag)
    else:
        tags = str(post_tag)

    header_lines = [
        '---',
        'title: {}'.format(post_title),
        'date: {}'.format(post_date),
        # rstrip avoids a dangling space when there are no tags
        'tags: {}'.format(tags).rstrip(),
        'layout: post',
        '---',
    ]

    return '\n'.join(header_lines) + '\n'
36
37
def print_custom_post_header(post_header_file, *args):

    '''
    returns content of a custom post header file (work in progress)

    post_header_file -- path to the header file, or an already opened
        text file object (it is closed after reading)
    *args -- reserved for values to substitute into the header,
        currently ignored
    '''

    # accept a path as well as a file object
    if isinstance(post_header_file, (str, bytes, os.PathLike)):
        post_header_file = open(post_header_file, 'r', encoding='utf-8')

    with post_header_file as f:
        post_header_content = f.read()

    # TODO: substitute *args into the header once its format is decided
    return post_header_content
49
50
def parse_post_photo(post, photo_dir):

    '''
    converts photo tag to markdown image link

    post -- message dict, post['photo'] holds the path of the image
    photo_dir -- directory used in the generated link

    only the file name of post['photo'] is kept and joined to photo_dir.
    returns the image markup followed by an empty line.
    '''

    photo_name = os.path.basename(post['photo'])
    photo_src = os.path.join(photo_dir, photo_name)

    # the template has to contain the {src} placeholder, otherwise the
    # path is silently dropped and only newlines are returned
    return '![image]({src})\n\n'.format(src=photo_src)
62
63
def parse_post_photo_as_file(post, media_dir):

    '''
    converts file tag with thumbnail to image and a link

    post -- message dict with 'file' and 'thumbnail' paths
    media_dir -- directory used in the generated links

    returns the thumbnail as a markdown image, followed by a
    "full size" link to the file and an empty line.
    '''

    # links to files are currently broken, because these files are
    # going to `files` directory, not `photos`.
    # need to track down any files with thumbnails and then to move them
    # to a photos directory.
    file_src = os.path.join(media_dir, os.path.basename(post['file']))
    thumbnail_src = os.path.join(media_dir,
                                 os.path.basename(post['thumbnail']))

    # {thumb} must appear in the template, otherwise the thumbnail
    # is never shown
    return '![image]({thumb})\n[full size]({file})\n\n'\
        .format(thumb=thumbnail_src, file=file_src)
84
85
def text_format(string, fmt):

    '''
    wraps string in markdown-styled formatting

    string -- text to wrap
    fmt -- markdown marker ('*', '**', '***', '`', '```') or, for anything
        else, the name of an html tag (e.g. 'u', 's')

    surrounding whitespace is kept out of the markers; newlines that
    followed the text are put back after the closing marker.
    whitespace-only strings are returned unchanged.
    '''

    stripped = string.strip()
    if not stripped:
        # nothing to wrap, avoid producing empty markers like `****`
        return string

    # the fenced block has to be checked before the inline markers
    if fmt == '```':
        template = '{fmt}\n{txt}\n{fmt}'
    elif fmt in ('*', '**', '***', '`'):
        template = '{fmt}{txt}{fmt}'
    else:
        template = '<{fmt}>{txt}</{fmt}>'

    output = template.format(fmt=fmt, txt=stripped)

    # restore only the newlines removed from the end by strip()
    trailing = string[len(string.rstrip()):]
    output += '\n' * trailing.count('\n')
    return output
102
103
def text_link_format(text, link):

    '''
    formats links

    text -- link text
    link -- link target

    returns a markdown link `[text](link)`; newlines that followed
    the text are put back after the link.
    '''

    # convert telegram links to anchors
    # this implies that telegram links are pointing to the same channel
    if link.startswith('https://t.me/c/'):
        # rstrip keeps the message id even if the link ends with a slash
        link = '#' + link.rstrip('/').split('/')[-1]

    link_fmt = '[{text}]({href})'.format(text=text.strip(), href=link)

    # restore only the newlines removed from the end by strip()
    trailing = text[len(text.rstrip()):]
    return link_fmt + '\n' * trailing.count('\n')
118
119
def parse_text_object(obj):

    '''
    detects type of text object and wraps it in corresponding formatting

    obj -- dict with 'type' and 'text' keys ('text_link' objects
        also carry 'href')

    returns formatted string. types without special handling (including
    'hashtag' and 'phone') are returned as their plain text.
    '''

    obj_type = obj['type']
    obj_text = obj['text']

    if obj_type == 'text_link':
        return text_link_format(obj_text, obj['href'])

    if obj_type in ('link', 'email'):
        link = obj_text.strip()
        # add a scheme only to bare links; links that already have one
        # (http://, https://, ...) are kept as they are
        if obj_type == 'link' and '://' not in link:
            link = 'https://' + link
        return '<{href}>'.format(href=link)

    # text styles and the marker/tag passed to text_format
    markers = {
        'italic': '*',
        'bold': '**',
        'code': '`',
        'pre': '```',
        'underline': 'u',
        'strikethrough': 's',
    }
    if obj_type in markers:
        return text_format(obj_text, markers[obj_type])

    # hashtag, phone and any unknown type: keep the text as is,
    # so that the caller never receives None
    return obj_text
163
164
def parse_post_text(post):

    '''
    converts text of a post to a markdown string

    post -- message dict; post['text'] is either a plain string or a list
        mixing plain strings and text objects (dicts)

    returns the text with every text object replaced by its formatted
    form; a post without 'text' gives an empty string.
    '''

    # TODO: handle reply-to
    post_raw_text = post.get('text', '')

    if isinstance(post_raw_text, str):
        return post_raw_text

    return ''.join(
        obj if isinstance(obj, str) else str(parse_text_object(obj))
        for obj in post_raw_text)
181
182
def parse_post_media(post, media_dir):

    '''
    wraps media files into html tags

    post -- message dict; post['file'] is the path of the media file,
        post['mime_type'] (optional) its mime type
    media_dir -- directory used in the generated link

    returns an html <audio> element preceded by a newline.
    '''

    post_media_file = os.path.basename(post['file'])
    post_media_src = os.path.join(media_dir, post_media_file)

    # leave the type attribute out when the mime type is unknown
    mime_type = post.get('mime_type')
    type_attr = ' type="{}"'.format(mime_type) if mime_type else ''

    # audiofiles can be presented as audioplayers and other media types
    # could be left as just links to them
    # ???
    # adjacent literals are used instead of backslash continuations,
    # so that source indentation does not end up in the output
    post_media = ('\n<audio controls>\n'
                  '<source src="{src}"{type_attr}>\n'
                  '</audio>').format(src=post_media_src,
                                     type_attr=type_attr)

    return post_media
201
202
def parse_post_file(post, media_dir):

    '''
    wrap files into link tags

    post -- message dict; post['file'] is the path of the attached file
    media_dir -- directory used in the generated link

    returns a markdown link surrounded by empty lines; the link text is
    the file name without its extension.
    '''

    post_file_name = os.path.basename(post['file'])
    # point the link to media_dir, like the other media helpers do
    post_file_src = os.path.join(media_dir, post_file_name)
    # splitext keeps names of dotfiles (e.g. `.bashrc`) intact
    post_file_title = os.path.splitext(post_file_name)[0]

    post_file = f'\n\n[{post_file_title}]({post_file_src})\n\n'

    return post_file
216
def parse_post(post, photo_dir, media_dir):

    '''
    builds the markdown body of a single post

    parts are emitted in this order: photo, file with thumbnail,
    post text, then either media or a plain file link.
    '''

    has_file = 'file' in post
    has_thumbnail = 'thumbnail' in post

    chunks = []

    # optional image
    # TODO: handle multiple photos in one post (maybe by comparing timestamps)
    if 'photo' in post:
        chunks.append(str(parse_post_photo(post, photo_dir)))

    if has_file and has_thumbnail:
        chunks.append(str(parse_post_photo_as_file(post, media_dir)))

    # post text
    chunks.append(str(parse_post_text(post)))

    # optional media
    if 'media_type' in post:
        chunks.append(str(parse_post_media(post, media_dir)))
    elif has_file and not has_thumbnail:
        chunks.append(str(parse_post_file(post, media_dir)))

    return ''.join(chunks)
243
244
def main():

    '''
    command line entry point

    parses arguments, loads the telegram export json and writes one
    markdown file per non-forwarded message into the output directory.
    files are named `<date>-<id>.md`.
    exits with an error message if the json file is not found.
    '''

    # imported locally: `sys` is not among the module-level imports
    import sys

    parser = argparse.ArgumentParser(
        usage='%(prog)s [options] json_file',
        description='Convert exported Telegram channel data json to '
                    'bunch of markdown posts ready to use with jekyll')
    parser.add_argument(
        'json', metavar='json_file',
        help='result.json file from telegram export')
    parser.add_argument(
        '-o', '--out-dir', metavar='out_dir',
        nargs='?', default='formatted_posts',
        help='output directory for markdown files '
             '(default: formatted_posts)')
    parser.add_argument(
        '-p', '--photo-dir', metavar='photo_dir',
        nargs='?', default='photos',
        help='location of image files. this changes only links '
             'to photos in markdown text, so specify your '
             'desired location (default: photos)')
    parser.add_argument(
        '-m', '--media-dir', metavar='media_dir',
        nargs='?', default='files',
        help='location of media files. this changes only links '
             'to files in markdown text, so specify your '
             'desired location (default: files)')
    args_wip = parser.add_argument_group('work in progress')
    args_wip.add_argument(
        '-H', '--post-header', metavar='post_header',
        nargs='?',
        help='yaml front matter for your posts '
             '(now doesn\'t work)')

    args = parser.parse_args()

    # an already existing output directory is fine (reruns), nested
    # paths are created as needed; other failures are only logged
    try:
        os.makedirs(args.out_dir, exist_ok=True)
    except OSError as error:
        logging.warning(error)

    # load json file
    try:
        with open(args.json, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        sys.exit('result.json not found.\nPlease, specify right file')

    # load only messages
    raw_posts = data['messages']

    for post in raw_posts:
        # TODO: handle forwarded posts
        if post['type'] != 'message' or 'forwarded_from' in post:
            continue

        post_date = datetime.fromisoformat(post['date'])
        post_id = post['id']
        post_filename = str(post_date.date()) + '-' + str(post_id) + '.md'
        post_path = os.path.join(args.out_dir, post_filename)

        with open(post_path, 'w', encoding='utf-8') as f:
            print(print_default_post_header(post_id, post_date, None),
                  file=f)
            print(parse_post(post, args.photo_dir, args.media_dir), file=f)


if __name__ == '__main__':
    main()