parse.py (view raw)
1#!/usr/bin/env python
2
# This script parses the exported output of a Telegram channel and converts
# each post into a Jekyll-ready post in Markdown.
#
# The Telegram export produces a result.json file, as well as several
# directories containing multimedia, photos, etc. This script creates a new
# directory and populates it with formatted posts ready to publish.
#
# TODO summary:
# - replies
# - single/multiple tags
# - forwarded posts
14
15import os
16# import sys
17import json
18from datetime import datetime
19
20# post:
21# header
22# [photo?]
23# text
24# [media?]
25
26# text:
27# [str|list(str|obj, ...)]
28
def print_post_header(post_title, post_date, post_tag):
    '''
    Build the Jekyll front matter block for a single post.

    Despite the name, nothing is printed: the front matter is returned as a
    string and the caller decides where to write it.

    Args:
        post_title: value rendered after "title:".
        post_date: value rendered after "date:".
        post_tag: value rendered after "tag:".

    Returns:
        The front matter text, delimited by "---" lines and terminated by a
        newline.
    '''
    # TODO: handle post tag/tags
    front_matter_lines = (
        '---',
        f'title: {post_title}',
        f'date: {post_date}',
        f'tag: {post_tag}',
        'layout: post',
        '---',
    )

    return '\n'.join(front_matter_lines) + '\n'
36
def parse_post_photo(post):
    '''
    Render the photo attached to a post as a Markdown image.

    Args:
        post: message dict; its 'photo' value is used as the image source.

    Returns:
        A Markdown image reference followed by a blank line.

    Raises:
        KeyError: if post has no 'photo' key.
    '''
    # The template must contain the {src} placeholder: without it the photo
    # path handed to format() is dropped and only blank lines are emitted.
    post_photo = '![photo]({src})\n\n'.format(src=post['photo'])

    return post_photo
41
def md_str(string):
    '''
    Adjust the line breaks of plain text for Markdown output.

    Every newline is doubled first; afterwards each '. ' (period followed by
    a space) is turned into a period followed by a single newline.
    '''
    # Order matters: the newlines inserted by the second step must not be
    # doubled by the first one.
    with_blank_lines = '\n\n'.join(string.split('\n'))
    one_sentence_per_line = '.\n'.join(with_blank_lines.split('. '))

    return one_sentence_per_line
47
48
# Markdown/HTML template for every entity type that only wraps its own text.
_TEXT_OBJECT_TEMPLATES = {
    'link': '[link]({text})',
    'bold': '**{text}**',
    'italic': '*{text}*',
    'underline': '<u>{text}</u>',
    'strikethrough': '<s>{text}</s>',
    'code': '```\n{text}\n```',
}


def parse_text_object(obj):
    '''
    Parse object from post text.

    Objects are text links, plain links, underlined text, strikethrough text,
    italic text, bold text, code blocks and hashtags.

    Args:
        obj: dict with at least 'type' and 'text' keys; 'text_link' objects
            must also carry 'href'.

    Returns:
        The Markdown/HTML rendering of the object. Hashtags, phone numbers
        and any type without a dedicated rendering are returned as their
        plain text, so an unexpected type never injects "None" into a post.

    Raises:
        KeyError: if 'type' or 'text' is missing, or 'href' is missing on a
            'text_link' object.
    '''
    obj_type = obj['type']
    obj_text = obj['text']

    # The only type that needs a second field besides the text.
    if obj_type == 'text_link':
        return '[{text}]({href})'.format(text=obj_text, href=obj['href'])

    template = _TEXT_OBJECT_TEMPLATES.get(obj_type)
    if template is not None:
        return template.format(text=obj_text)

    # 'hashtag' and 'phone' (hyphenated numbers get detected as phone
    # numbers) are deliberately plain text; unknown types fall back to plain
    # text as well instead of returning None.
    return obj_text
99
100
def parse_post_text(post):
    '''
    Flatten the text of a post into a single string.

    post['text'] is either a plain string or a list that mixes plain strings
    with text objects (dicts). Plain strings are kept as they are; every
    other list item is handed to parse_text_object() and its result is
    converted with str().

    Args:
        post: message dict with a 'text' key.

    Returns:
        The concatenated text of the post.

    Raises:
        KeyError: if post has no 'text' key.
    '''
    # TODO: handle reply-to
    post_raw_text = post['text']

    if isinstance(post_raw_text, str):
        return post_raw_text

    # join() builds the result in one pass instead of repeated +=.
    return ''.join(
        item if isinstance(item, str) else str(parse_text_object(item))
        for item in post_raw_text
    )
117
118
def parse_post_media(post):
    '''
    Render the media file attached to a post as an HTML5 player element.

    Args:
        post: message dict; 'file' is used as the source path and the
            optional 'mime_type' as the source type.

    Returns:
        A <video> element when the MIME type starts with "video/", otherwise
        an <audio> element, wrapping a single <source>. The type attribute is
        left out when the post carries no MIME type.

    Raises:
        KeyError: if post has no 'file' key.
    '''
    mime_type = post.get('mime_type')

    # Video wrapped in <audio> would lose its picture, so pick the element
    # from the MIME type and keep <audio> as the default.
    is_video = bool(mime_type) and mime_type.startswith('video/')
    element = 'video' if is_video else 'audio'

    # Not every media entry is guaranteed to have a MIME type; omit the
    # attribute rather than fail or emit type="None".
    type_attr = ' type="{}"'.format(mime_type) if mime_type else ''

    post_media = (
        '<{element} controls>\n'
        '  <source src="{src}"{type_attr}>\n'
        '  </{element}>'
    ).format(element=element, src=post['file'], type_attr=type_attr)

    return post_media
125
126
def parse_post(post):
    '''
    Assemble the body of one post.

    The body consists of, in this order: the photo (only when the post has a
    'photo' key), the post text passed through md_str(), and the media
    element (only when the post has a 'media_type' key).

    Args:
        post: message dict.

    Returns:
        The concatenated body as a single string.
    '''
    sections = []

    # optional image
    if 'photo' in post:
        sections.append(str(parse_post_photo(post)))

    # post text
    sections.append(md_str(parse_post_text(post)))

    # optional media
    if 'media_type' in post:
        sections.append(str(parse_post_media(post)))

    return ''.join(sections)
142
143
def main():
    '''
    Convert result.json from the current directory into Markdown posts.

    Creates ./formatted_posts if needed and writes one
    "<date>-<id>.md" file per entry of data['messages'] whose type is
    'message' and which has no 'forwarded_from' key. Each file holds the
    front matter followed by the parsed post body.

    Raises:
        FileNotFoundError: if result.json does not exist.
        FileExistsError: if ./formatted_posts exists but is not a directory.
        KeyError: if an expected key is missing from the export.
    '''
    # create output directory; exist_ok makes re-runs work without a
    # swallow-everything try/except
    out_dir = './formatted_posts'
    os.makedirs(out_dir, exist_ok=True)

    # load json file; JSON text is UTF-8, so do not depend on the locale's
    # default encoding
    with open('result.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    # load only messages
    raw_posts = data['messages']

    for post in raw_posts:
        # TODO: handle forwarded posts
        if post['type'] != 'message' or 'forwarded_from' in post:
            continue

        post_date = datetime.fromisoformat(post['date']).date()
        post_id = post['id']
        post_filename = f'{out_dir}/{post_date}-{post_id}.md'

        # Write UTF-8 explicitly so non-ASCII post text is stored the same
        # way on every platform.
        with open(post_filename, 'w', encoding='utf-8') as f:
            print(print_post_header(post_id, post_date, None), file=f)
            print(parse_post(post), file=f)


if __name__ == '__main__':
    main()
177