parse.py (view raw)
1#!/usr/bin/env python
2
# This script parses the export of a Telegram channel and converts each post
# into a Jekyll-ready post in Markdown.
#
# Telegram creates a result.json file, as well as several directories
# containing multimedia, photos, etc. This script creates a new directory and
# populates it with formatted posts ready to publish.
#
# TODO summary:
# - replies
# - single/multiple tags
# - forwarded posts
# - custom post header
15
16import os
17import sys
18import json
19from datetime import datetime
20
21
def print_post_header(post_title, post_date, post_tag):
    """Build the Jekyll front-matter block for a single post.

    Args:
        post_title: Value written to the ``title`` field.
        post_date: Value written to the ``date`` field.
        post_tag: Value written to the ``tag`` field, or ``None`` when the
            post has no tag.

    Returns:
        The front matter as a string, terminated by a newline.
    """
    # TODO: handle post tag/tags
    # TODO: support for custom header
    lines = [
        '---',
        'title: {title}'.format(title=post_title),
        'date: {date}'.format(date=post_date),
    ]

    # Formatting None would write the literal text "None" as the tag, so the
    # field is left out entirely when no tag is available.
    if post_tag is not None:
        lines.append('tag: {tag}'.format(tag=post_tag))

    lines.append('layout: post')
    lines.append('---')

    return '\n'.join(lines) + '\n'
30
31
def parse_post_photo(post, media_dir):
    """Return a Markdown image reference for the photo attached to a post.

    Args:
        post: Message dict; ``post['photo']`` holds the photo path.
        media_dir: Directory prefix the photo is referenced under.

    Returns:
        A Markdown image line followed by a blank line.
    """
    # Keep only the file name; any parent directory in the stored path is
    # replaced by media_dir.
    photo_name = post['photo'].rsplit('/', 1)[-1]
    photo_src = media_dir + '/' + photo_name

    # The template must actually reference {src}, otherwise the photo is
    # silently dropped from the generated post.
    return '![image]({src})\n\n'.format(src=photo_src)
39
40
41# def md_str(string):
42 # string = string.replace('\n','\n\n')
43 # string = string.replace('. ', '.\n')
44
45 # return string
46
47
def text_format(string, fmt):
    """Wrap *string* in the given Markdown marker or HTML tag.

    Args:
        string: Text to format; surrounding whitespace is stripped before
            wrapping.
        fmt: One of the Markdown markers ``*``, ``**``, ``***``, `````` ` ``````
            for inline wrapping, ``` for a fenced block, or otherwise an HTML
            tag name such as ``u`` or ``s``.

    Returns:
        The wrapped text, followed by as many newlines as the stripped
        trailing whitespace of *string* contained.
    """
    if fmt == '```':
        # Fenced code block: the fences must sit on their own lines. This
        # case has to be tested before the inline markers.
        template = '{fmt}\n{txt}\n{fmt}'
    elif fmt in ('*', '**', '***', '`'):
        template = '{fmt}{txt}{fmt}'
    else:
        template = '<{fmt}>{txt}</{fmt}>'

    output = template.format(fmt=fmt, txt=string.strip())

    # Re-append only the newlines that strip() removed from the end, so that
    # blank lines inside the text do not inflate the count.
    trailing = string[len(string.rstrip()):]
    return output + '\n' * trailing.count('\n')
59
def text_link_format(text, link):
    """Return a Markdown inline link ``[text](link)``.

    Args:
        text: Link label; surrounding whitespace is stripped.
        link: Target URL, inserted unchanged.

    Returns:
        The Markdown link, followed by as many newlines as the stripped
        trailing whitespace of *text* contained.
    """
    link_fmt = '[{text}]({href})'.format(text=text.strip(), href=link)

    # Restore only the trailing newlines removed by strip(); newlines inside
    # the label are already part of the formatted text.
    trailing = text[len(text.rstrip()):]
    return link_fmt + '\n' * trailing.count('\n')
65
66
def parse_text_object(obj):
    """Convert one structured text entity into Markdown/HTML text.

    Args:
        obj: Entity dict with at least ``type`` and ``text`` keys;
            ``text_link`` entities also carry ``href``.

    Returns:
        The formatted string. Hashtags, phone numbers and entity types this
        function does not know are returned as their plain text.
    """
    obj_type = obj['type']
    obj_text = obj['text']

    if obj_type == 'text_link':
        return text_link_format(obj_text, obj['href'])

    if obj_type in ('link', 'email'):
        # Markdown autolink syntax.
        return '<{href}>'.format(href=obj_text.strip())

    # Entity type -> Markdown marker or HTML tag handed to text_format.
    markers = {
        'italic': '*',
        'bold': '**',
        'code': '`',
        'pre': '```',
        'underline': 'u',
        'strikethrough': 's',
    }
    if obj_type in markers:
        return text_format(obj_text, markers[obj_type])

    # 'hashtag', 'phone' and any unrecognised type: keep the text as-is.
    # Returning None here would end up as the literal "None" in the post,
    # because the caller stringifies the result.
    return obj_text
103
104
def parse_post_text(post):
    """Return the text of a post as a single Markdown string.

    Args:
        post: Message dict; ``post['text']`` is either a plain string or an
            iterable mixing plain strings and entity dicts.

    Returns:
        The plain string unchanged, or the concatenation of all parts with
        each entity dict rendered by ``parse_text_object``.
    """
    # TODO: handle reply-to
    raw_text = post['text']

    if isinstance(raw_text, str):
        return raw_text

    return ''.join(
        part if isinstance(part, str) else str(parse_text_object(part))
        for part in raw_text
    )
121
122
def parse_post_media(post, media_dir):
    """Return an HTML ``<audio>`` element for the file attached to a post.

    Args:
        post: Message dict; ``post['file']`` holds the file path and
            ``post['mime_type']``, when present, its MIME type.
        media_dir: Directory prefix the file is referenced under.

    Returns:
        The HTML snippet, starting with a newline.
    """
    # get filename without parent directory
    media_name = post['file'].rsplit('/', 1)[-1]

    # add parent directory
    media_src = media_dir + '/' + media_name

    # The type attribute is optional in HTML, so a message without a
    # mime_type still yields a usable <source> instead of raising KeyError.
    mime_type = post.get('mime_type')
    if mime_type:
        source_tag = '<source src="{src}" type="{mime_type}">'.format(
            src=media_src, mime_type=mime_type)
    else:
        source_tag = '<source src="{src}">'.format(src=media_src)

    # Built from explicit pieces so that source-code indentation never leaks
    # into the generated markup.
    return '\n<audio controls>\n  ' + source_tag + '\n</audio>'
134
135
def parse_post(post):
    """Assemble the body of one post: optional photo, text, optional media.

    Args:
        post: Message dict. A ``photo`` key triggers the image section and a
            ``media_type`` key triggers the media section.

    Returns:
        The concatenated post body as a string.
    """
    sections = []

    # optional image, referenced under /photos
    if 'photo' in post:
        sections.append(str(parse_post_photo(post, '/photos')))

    # post text is always present
    sections.append(str(parse_post_text(post)))

    # optional media, referenced under /files
    if 'media_type' in post:
        sections.append(str(parse_post_media(post, '/files')))

    return ''.join(sections)
153
154
def main():
    """Convert a Telegram export directory into Jekyll posts.

    The export directory is taken from the first command-line argument and
    defaults to the current directory. ``result.json`` is read from it and one
    Markdown file per non-forwarded message is written to a
    ``formatted_posts`` subdirectory.

    Exits with an error message when ``result.json`` cannot be found.
    """
    # use the directory from the first argument, or the current directory
    input_dir = sys.argv[1] if len(sys.argv) > 1 else '.'

    # load json file first, so that no empty output directory is left behind
    # when the input is missing
    json_path = os.path.join(input_dir, 'result.json')
    try:
        # explicit encoding: do not depend on the platform's locale default
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        sys.exit('result.json not found.\nPlease, specify right directory')

    # create output directory (reuse it if it is already there)
    out_dir = os.path.join(input_dir, 'formatted_posts')
    try:
        os.mkdir(out_dir)
    except FileExistsError:
        pass

    # load only messages
    for post in data['messages']:
        # TODO: handle forwarded posts
        if post['type'] != 'message' or 'forwarded_from' in post:
            continue

        post_date = datetime.fromisoformat(post['date'])
        post_id = post['id']
        post_filename = os.path.join(
            out_dir,
            '{date}-{post_id}.md'.format(
                date=post_date.date(), post_id=post_id))

        with open(post_filename, 'w', encoding='utf-8') as f:
            print(print_post_header(post_id, post_date, None), file=f)
            print(parse_post(post), file=f)


if __name__ == '__main__':
    main()
199