#!/usr/bin/env python
# This script parses output from Telegram channel and converts each post to
# jekyll-applicable post in markdown.
#
# Telegram creates result.json file, as well as different directories containing
# multimedia, photos, etc. This script creates new directory and populates it
# with formatted posts ready to publish.
#
# TODO summary:
# - replies
# - single/muliple tags
# - forwarded posts
import os
# import sys
import json
from datetime import datetime
# post:
# header
# [photo?]
# text
# [media?]
# text:
# [str|list(str|obj, ...)]
def print_post_header(post_title, post_date, post_tag):
# TODO: handle post tag/tags
post_header = '---\ntitle: {title}\ndate: {date}\n\
tag: {tag}\nlayout: post\n---\n'.format(\
title=post_title, date=post_date, tag=post_tag)
return post_header
def parse_post_photo(post):
post_photo = '\n\n'.format(src=post['photo'])
return post_photo
def md_str(string):
string = string.replace('\n','\n\n')
string = string.replace('. ', '.\n')
return string
def parse_text_object(obj):
'''
Parse object from post text.
Objects are text links, plain links, underlined text, strikethrough text,
italic text, bold text, code blocks and hashtags.
This is a mess, but what is better?
'''
obj_type = obj['type']
obj_text = obj['text']
if obj_type == 'hashtag':
post_tag = obj_text
return post_tag
elif obj_type == 'text_link':
post_text_link = '[{text}]({href})'.format(text=obj_text, \
href=obj['href'])
return post_text_link
elif obj_type == 'link':
post_link = '[link]({href})'.format(href=obj_text)
return post_link
# I dunno how this appeared, but it seems like hyphenated numbers
# are treated as phone numbers, so return them as plain text.
elif obj_type == 'phone':
return obj_text
elif obj_type == 'bold':
post_inline_bold = '**{text}**'.format(text=obj_text)
return post_inline_bold
elif obj_type == 'italic':
post_inline_italic = '*{text}*'.format(text=obj_text)
return post_inline_italic
elif obj_type == 'underline':
post_inline_underline = '{text}'.format(text=obj_text)
return post_inline_underline
elif obj_type == 'strikethrough':
post_inline_strike = '{text}'.format(text=obj_text)
return post_inline_strike
elif obj_type == 'code':
post_inline_code = '```\n{text}\n```'.format(text=obj_text)
return post_inline_code
def parse_post_text(post):
# TODO: handle reply-to
post_raw_text = post['text']
post_parsed_text = ''
if type(post_raw_text) == str:
return str(post_raw_text)
else:
for obj in post_raw_text:
if type(obj) == str:
post_parsed_text += obj
else:
post_parsed_text += str(parse_text_object(obj))
return post_parsed_text
def parse_post_media(post):
post_media = ''.format(src=post['file'], mime_type=post['mime_type'])
return post_media
def parse_post(post):
post_output = ''
# optional image
if 'photo' in post:
post_output += str(parse_post_photo(post))
# post text
post_output += md_str(parse_post_text(post))
# optional media
if 'media_type' in post:
post_output += str(parse_post_media(post))
return post_output
def main():
# create output directory
out_dir = './formatted_posts'
try:
os.mkdir(out_dir)
except FileExistsError as e:
pass
# load json file
with open('result.json', 'r') as f:
data = json.load(f)
# load only messages
raw_posts = data['messages']
for post in raw_posts:
# TODO: handle forwarded posts
if post['type'] == 'message' and 'forwarded_from' not in post:
post_date = datetime.fromisoformat(post['date'])
post_id = post['id']
post_filename = out_dir + '/' + str(post_date.date()) + '-' \
+ str(post_id) + '.md'
with open (post_filename, 'w') as f:
print(print_post_header(
post_id, post_date.date(), None),
file=f)
print(parse_post(post), file=f)
if __name__ == '__main__':
main()