tg2md.py (view raw)
1#!/usr/bin/env python
2
3# parse.py - converts telegram json to jekyll md.
4# Copyright (c) 2020, Lev Brekalov
5
# TODO summary:
# - replies
# - single/multiple tags
# - forwarded posts
# - custom post header
11
12import os
13import argparse
14import json
15from datetime import datetime
16
def print_default_post_header(post_title, post_date, post_tag):
    '''
    returns default jekyll front matter for a post

    post_title -- value for the title field
    post_date  -- value for the date field
    post_tag   -- value for the tags field; None leaves the field empty

    values are inserted with str.format, so any object with a usable
    string form is accepted. despite the name nothing is printed: the
    header is returned as a string that ends with a newline.
    '''

    # TODO: handle multiple tags
    # TODO: support for custom header

    # main() passes None for the tag; formatting None directly would write
    # the text "None" into the front matter and tag every post with it
    tag = '' if post_tag is None else post_tag

    post_header = '---\n'\
        'title: {title}\n'\
        'date: {date}\n'\
        'tags: {tag}\n'\
        'layout: post\n'\
        '---\n'.format(title=post_title, date=post_date, tag=tag)

    return post_header
34
35
def print_custom_post_header(post_header_file, *args):
    '''
    returns the content of a custom post header file

    post_header_file -- already opened file object; it is used as a
                        context manager, so it is closed on return
    *args            -- accepted but currently ignored

    work in progress: main() does not call this function.
    '''

    # read through the file object bound by the with statement; there is
    # no free function called read()
    with post_header_file as f:
        post_header_content = f.read()

    # TODO: substitute *args into the header once a template syntax
    # for custom headers is decided
    return post_header_content
47
48
def parse_post_photo(post, photo_dir):
    '''
    converts photo tag to markdown image link

    post      -- message dict; post['photo'] is the exported photo path
    photo_dir -- directory to use in the link instead of the exported one

    returns a markdown image link to the photo inside photo_dir,
    followed by a blank line.
    '''

    # keep only the file name: the directory from the export is replaced
    # with photo_dir
    post_photo_src = os.path.basename(post['photo'])
    post_photo_src = os.path.join(photo_dir, post_photo_src)

    # the template needs the {src} placeholder, otherwise the computed
    # path is dropped and only two newlines are emitted
    post_photo = '![image]({src})\n\n'.format(src=post_photo_src)

    return post_photo
60
61
def text_format(string, fmt):
    '''
    wraps string in markdown-styled formatting

    string -- text to wrap; surrounding whitespace is stripped first
    fmt    -- inline markdown marker (one to three asterisks or a single
              backtick), the three-backtick code fence, or the name of
              an html tag such as u or s

    returns the wrapped text. the fence is put on its own lines around
    the text; any other fmt that is not an inline marker is emitted as
    an opening and closing html tag. when string ends with a newline,
    one newline per empty piece of string split on newlines is appended
    after the closing marker.
    '''

    # the fence is tested first and kept out of the inline markers,
    # otherwise its branch could never be reached
    if fmt == '```':
        output = '{fmt}\n{txt}\n{fmt}'
    elif fmt in ('*', '**', '***', '`'):
        output = '{fmt}{txt}{fmt}'
    else:
        output = '<{fmt}>{txt}</{fmt}>'

    output = output.format(fmt=fmt, txt=string.strip())

    # strip() removed the trailing line breaks; put them back after the
    # closing marker so the following text stays on its own line
    output += '\n' * string.split('\n').count('') * string.endswith('\n')
    return output
78
79
def text_link_format(text, link):
    '''
    builds a markdown link from a label and a target

    text -- link label; surrounding whitespace is stripped
    link -- link target

    returns the markdown link. when text ends with a newline, one newline
    per newline found in text is appended after the link.
    '''

    # links that point into a telegram channel are turned into in-page
    # anchors named after the last path segment; this assumes they refer
    # to the very channel that is being converted
    href = link
    if link.startswith('https://t.me/c/'):
        href = '#' + link.rsplit('/', 1)[-1]

    label = text.strip()

    if text.endswith('\n'):
        trailing = '\n' * text.count('\n')
    else:
        trailing = ''

    return '[' + label + '](' + href + ')' + trailing
94
95
def parse_text_object(obj):
    '''
    detects type of text object and wraps it in corresponding formatting

    obj -- dict with at least the keys type and text; objects of type
           text_link also need href

    returns the markdown/html string for the object. hashtags, phone
    numbers and objects of any type without dedicated formatting are
    returned as their plain text.
    '''

    obj_type = obj['type']
    obj_text = obj['text']

    # types that only need wrapping in a marker known to text_format
    markers = {
        'italic': '*',
        'bold': '**',
        'code': '`',
        'pre': '```',
        'underline': 'u',
        'strikethrough': 's',
    }

    if obj_type in markers:
        return text_format(obj_text, markers[obj_type])

    if obj_type == 'text_link':
        return text_link_format(obj_text, obj['href'])

    if obj_type in ('link', 'email'):
        link = obj_text.strip()
        # bare links get a scheme so the autolink is valid; links that
        # already start with http:// or https:// are left untouched
        has_scheme = link.startswith(('http://', 'https://'))
        if obj_type == 'link' and not has_scheme:
            link = 'https://' + link
        return '<{href}>'.format(href=link)

    # plain text for everything else. returning None here would show up
    # as the text "None" in the post, because parse_post_text passes the
    # result through str()
    return obj_text
139
140
def parse_post_text(post):
    '''
    converts the text of a post to markdown

    post -- message dict; post['text'] is either a plain string or a
            list whose items are plain strings or text objects (dicts)

    returns the whole text as one string.
    '''

    # TODO: handle reply-to
    post_raw_text = post['text']

    if isinstance(post_raw_text, str):
        return str(post_raw_text)

    parts = []
    for obj in post_raw_text:
        if isinstance(obj, str):
            parts.append(obj)
            continue

        parsed = parse_text_object(obj)
        # parse_text_object may return None for a type it does not
        # handle; use the raw text then instead of writing "None"
        if parsed is None:
            parsed = obj.get('text', '')
        parts.append(str(parsed))

    return ''.join(parts)
157
158
def parse_post_media(post, media_dir):
    '''
    wraps file links into html tags

    post      -- message dict; post['file'] is the exported file path and
                 post['mime_type'] its mime type
    media_dir -- directory to use in the link instead of the exported one

    returns an html audio element, preceded by a newline, whose source
    points to the file inside media_dir.
    '''

    # get filename without parent directory
    post_media_src = os.path.basename(post['file'])

    # add parent directory
    post_media_src = os.path.join(media_dir, post_media_src)

    # adjacent literals instead of backslash continuations inside a single
    # literal: the indentation of a continued line would become part of
    # the emitted html
    post_media = (
        '\n<audio controls>\n'
        '  <source src="{src}" type="{mime_type}">\n'
        '</audio>'
    ).format(src=post_media_src, mime_type=post['mime_type'])

    return post_media
175
176
def parse_post(post, photo_dir, media_dir):
    '''
    renders one post object as formatted text

    post      -- message dict
    photo_dir -- directory used in image links
    media_dir -- directory used in media file links

    the result is the image link (only when the post has a photo key),
    then the post text, then the media markup (only when the post has a
    media_type key), joined without any separator.
    '''

    pieces = []

    # image link comes first
    if 'photo' in post:
        pieces.append(str(parse_post_photo(post, photo_dir)))

    # the text is rendered for every post
    pieces.append(str(parse_post_text(post)))

    # attached file comes last
    if 'media_type' in post:
        pieces.append(str(parse_post_media(post, media_dir)))

    return ''.join(pieces)
197
198
def main():
    '''
    command line entry point

    reads the telegram export named on the command line and writes one
    markdown file per message of type message that is not forwarded.
    files go to the output directory and are named after the message
    date and id, with the extension .md. exits with an error message
    when the json file does not exist.
    '''

    # imported here because the module-level imports do not provide sys,
    # which is needed for sys.exit below
    import sys

    parser = argparse.ArgumentParser(
        usage='%(prog)s [options] json_file',
        description='Convert exported Telegram channel data json to '
                    'bunch of markdown posts ready to use with jekyll')
    parser.add_argument(
        'json', metavar='json_file',
        help='result.json file from telegram export')

    # the directory options require a value: with nargs='?' a bare flag
    # would store None and crash later when the path is used
    parser.add_argument(
        '--out-dir', metavar='out_dir',
        default='formatted_posts',
        help='output directory for markdown files '
             '(default: formatted_posts)')
    parser.add_argument(
        '--photo-dir', metavar='photo_dir',
        default='photos',
        help='location of image files. this changes only links '
             'to photos in markdown text, so specify your '
             'desired location (default: photos)')
    parser.add_argument(
        '--media-dir', metavar='media_dir',
        default='files',
        help='location of media files. this changes only links '
             'to files in markdown text, so specify your '
             'desired location (default: files)')
    args_wip = parser.add_argument_group('work in progress')
    args_wip.add_argument(
        '--post-header', metavar='post_header',
        nargs='?',
        help='yaml front matter for your posts '
             '(now doesn\'t work)')

    args = parser.parse_args()

    # also creates missing parent directories; an already existing
    # output directory is not an error
    os.makedirs(args.out_dir, exist_ok=True)

    # load json file
    try:
        with open(args.json, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        sys.exit('result.json not found.\nPlease, specify right file')

    # load only messages
    raw_posts = data['messages']

    for post in raw_posts:
        # TODO: handle forwarded posts
        if post['type'] != 'message' or 'forwarded_from' in post:
            continue

        post_date = datetime.fromisoformat(post['date'])
        post_id = post['id']
        post_filename = str(post_date.date()) + '-' + str(post_id) + '.md'
        post_path = os.path.join(args.out_dir, post_filename)

        with open(post_path, 'w', encoding='utf-8') as f:
            # the message id doubles as the post title; tags are not
            # extracted yet, so None is passed
            print(print_default_post_header(
                post_id, post_date, None), file=f)
            print(parse_post(post, args.photo_dir, args.media_dir), file=f)


if __name__ == '__main__':
    main()