aboutsummaryrefslogtreecommitdiffstats
path: root/tg2md.py
diff options
context:
space:
mode:
Diffstat (limited to 'tg2md.py')
-rw-r--r--tg2md.py202
1 files changed, 202 insertions, 0 deletions
diff --git a/tg2md.py b/tg2md.py
new file mode 100644
index 0000000..ca4ef35
--- /dev/null
+++ b/tg2md.py
@@ -0,0 +1,202 @@
+#!/usr/bin/env python
+
+# tg2md.py - converts telegram json to jekyll md.
+# Copyright (c) 2020, Lev Brekalov
+
+# TODO summary:
+# - replies
+# - single/multiple tags
+# - forwarded posts
+# - custom post header
+
+import os
+import sys
+import json
+from datetime import datetime
+
+
def print_post_header(post_title, post_date, post_tag):
    """Build the Jekyll front matter block for one post.

    Args:
        post_title: value of the ``title`` field (main() passes the post id).
        post_date: value of the ``date`` field; rendered with ``str()``.
        post_tag: value of the ``tag`` field, or ``None`` when the post
            has no tag.

    Returns:
        The front matter as a string: ``---`` delimited, one field per
        line, terminated by a newline.
    """
    # TODO: handle multiple tags
    # TODO: support for custom header
    lines = [
        '---',
        'title: {}'.format(post_title),
        'date: {}'.format(post_date),
    ]

    # Formatting None would write the literal text "tag: None" into the
    # header, so the field is left out when there is no tag.
    if post_tag is not None:
        lines.append('tag: {}'.format(post_tag))

    lines.append('layout: post')
    lines.append('---')

    return '\n'.join(lines) + '\n'
+
+
def parse_post_photo(post, media_dir):
    """Return the Markdown image line for a post's photo.

    Args:
        post: message dict; ``post['photo']`` holds the photo path.
        media_dir: directory prefix used in the generated image URL.

    Returns:
        ``![image](<media_dir>/<filename>)`` followed by a blank line.
    """
    # Keep only the file name. Taking the last path component (instead of
    # cutting a fixed 7-character 'photos/' prefix) also works when the
    # photo path has a different or deeper parent directory, and matches
    # how parse_post_media() derives its file name.
    photo_name = post['photo'].rsplit('/', 1)[-1]
    photo_src = media_dir + '/' + photo_name

    return '![image]({src})\n\n'.format(src=photo_src)
+
+
+# def md_str(string):
+ # string = string.replace('\n','\n\n')
+ # string = string.replace('. ', '.\n')
+
+ # return string
+
+
def text_format(string, fmt):
    """Wrap text in a Markdown marker or an HTML tag.

    Args:
        string: the text to format; surrounding whitespace is stripped
            before wrapping.
        fmt: ``'*'``, ``'**'``, ``'***'`` or ``'`'`` for inline Markdown
            markers, ``'```'`` for a fenced code block, anything else is
            used as an HTML tag name (e.g. ``'u'`` -> ``<u>...</u>``).

    Returns:
        The wrapped text, followed by as many newlines as the input
        ended with.
    """
    # The fence must be tested first: it needs its own lines around the
    # code, unlike the inline markers.
    if fmt == '```':
        template = '{fmt}\n{txt}\n{fmt}'
    elif fmt in ('*', '**', '***', '`'):
        template = '{fmt}{txt}{fmt}'
    else:
        template = '<{fmt}>{txt}</{fmt}>'

    output = template.format(fmt=fmt, txt=string.strip())

    # strip() removed the line breaks that followed the text; put back
    # exactly the trailing ones (blank lines inside the text stay inside).
    trailing_newlines = len(string) - len(string.rstrip('\n'))
    return output + '\n' * trailing_newlines
+
def text_link_format(text, link):
    """Format a Telegram text link as a Markdown link.

    Args:
        text: the visible link text; surrounding whitespace is stripped.
        link: the link target.

    Returns:
        ``[text](href)`` followed by as many newlines as ``text`` ended
        with.
    """
    # convert telegram links to anchors
    # this implies that telegram links are pointing to the same channel
    if link.startswith('https://t.me/c/'):
        link = '#' + link.split('/')[-1]

    link_fmt = '[{text}]({href})'.format(text=text.strip(), href=link)

    # Restore only the line breaks that trailed the text; newlines inside
    # the link text are already part of the stripped text.
    trailing_newlines = len(text) - len(text.rstrip('\n'))
    return link_fmt + '\n' * trailing_newlines
+
+
def parse_text_object(obj):
    """Convert one Telegram text entity to Markdown/HTML.

    Args:
        obj: entity dict with at least ``'type'`` and ``'text'`` keys
            (``'text_link'`` entities also carry ``'href'``).

    Returns:
        The entity rendered as a string. Entity types without a special
        rendering (hashtag, phone, and any type not listed here) are
        returned as their plain text.
    """
    obj_type = obj['type']
    obj_text = obj['text']

    if obj_type == 'text_link':
        return text_link_format(obj_text, obj['href'])

    if obj_type in ('link', 'email'):
        link = obj_text.strip()
        # Bare links get a scheme so the autolink is valid; links that
        # already carry one (http://, https://, ...) are left untouched.
        if obj_type == 'link' and '://' not in link:
            link = 'https://' + link
        return '<{href}>'.format(href=link)

    # entity type -> marker/tag understood by text_format()
    markers = {
        'italic': '*',
        'bold': '**',
        'code': '`',
        'pre': '```',
        'underline': 'u',
        'strikethrough': 's',
    }
    if obj_type in markers:
        return text_format(obj_text, markers[obj_type])

    # Falling through without a return would yield None, which
    # parse_post_text() turns into the literal text "None"; keep the
    # original text instead.
    return obj_text
+
+
def parse_post_text(post):
    """Return the text of a post as a single Markdown string.

    ``post['text']`` is either a plain string or a list whose items are
    plain strings and entity dicts; entity dicts are rendered with
    parse_text_object().

    Args:
        post: message dict.

    Returns:
        The concatenated text; an empty string when the post has no
        ``'text'`` key.
    """
    # TODO: handle reply-to
    raw_text = post.get('text', '')

    if isinstance(raw_text, str):
        return raw_text

    return ''.join(
        part if isinstance(part, str) else str(parse_text_object(part))
        for part in raw_text
    )
+
+
def parse_post_media(post, media_dir):
    """Return an HTML player element for the file attached to a post.

    Args:
        post: message dict; ``post['file']`` is the attachment path and
            ``post['mime_type']`` (optional) its MIME type.
        media_dir: directory prefix used in the generated ``src`` URL.

    Returns:
        A ``<video controls>`` element when the MIME type starts with
        ``video/``, otherwise an ``<audio controls>`` element, preceded
        by a newline.
    """
    # get filename without parent directory
    file_name = post['file'].rsplit('/', 1)[-1]

    # add parent directory
    media_src = media_dir + '/' + file_name

    mime_type = post.get('mime_type', '')
    tag = 'video' if mime_type.startswith('video/') else 'audio'
    # Leave the attribute out rather than emit type="" when the export
    # carries no MIME type.
    type_attr = ' type="{}"'.format(mime_type) if mime_type else ''

    # Adjacent literals instead of backslash continuations, so the
    # indentation of this source file does not leak into the markup.
    return (
        '\n<{tag} controls>\n'
        '  <source src="{src}"{type_attr}>\n'
        '</{tag}>'
    ).format(tag=tag, src=media_src, type_attr=type_attr)
+
+
def parse_post(post, photo_dir='/photos', media_dir='/files'):
    """Render the body of one post: optional photo, text, optional media.

    Args:
        post: message dict.
        photo_dir: URL prefix for photos (used when ``'photo'`` is in
            the post).
        media_dir: URL prefix for attached files (used when
            ``'media_type'`` is in the post).

    Returns:
        The post body as a single string.
    """
    parts = []

    # optional image
    if 'photo' in post:
        parts.append(parse_post_photo(post, photo_dir))

    # post text
    parts.append(parse_post_text(post))

    # optional media
    if 'media_type' in post:
        parts.append(parse_post_media(post, media_dir))

    return ''.join(parts)
+
+
def main():
    """Convert a Telegram export into one Markdown file per post.

    Reads ``result.json`` from the directory given as the first
    command-line argument (current directory when omitted) and writes
    ``<date>-<id>.md`` files into its ``formatted_posts`` subdirectory.
    Only entries of type ``'message'`` that are not forwarded are
    converted. Exits with an error message when ``result.json`` is
    missing.
    """
    # directory from first argument; current directory if not specified
    input_dir = sys.argv[1] if len(sys.argv) > 1 else '.'

    # load json file first, so a wrong directory is reported before
    # anything is created on disk
    json_path = input_dir + '/' + 'result.json'
    try:
        # JSON is UTF-8 by specification; do not depend on the locale's
        # default encoding
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        sys.exit('result.json not found.\nPlease, specify right directory')

    # create output directory (fine if it already exists)
    out_dir = input_dir + '/' + 'formatted_posts'
    os.makedirs(out_dir, exist_ok=True)

    # load only messages
    for post in data['messages']:
        # TODO: handle forwarded posts
        if post['type'] != 'message' or 'forwarded_from' in post:
            continue

        post_date = datetime.fromisoformat(post['date'])
        post_id = post['id']
        post_filename = '{}/{}-{}.md'.format(
            out_dir, post_date.date(), post_id)

        # write UTF-8 so non-ASCII post text survives on every platform
        with open(post_filename, 'w', encoding='utf-8') as f:
            print(print_post_header(post_id, post_date, None), file=f)
            print(parse_post(post), file=f)


if __name__ == '__main__':
    main()
+