From 48d5e233fda099780ab6705505f4154b0cefa11b Mon Sep 17 00:00:00 2001 From: la-ninpre Date: Thu, 26 Nov 2020 23:56:43 +0300 Subject: make a rename notice repo name also changed --- README.md | 3 +- parse.py | 195 ----------------------------------------------------------- tg2md.py | 202 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 204 insertions(+), 196 deletions(-) delete mode 100644 parse.py create mode 100644 tg2md.py diff --git a/README.md b/README.md index 2eb604e..959d9b3 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# telegram2md +# tg2md This script parses output from Telegram channel and converts each post to jekyll-applicable post in markdown. @@ -37,6 +37,7 @@ and `files`, you should change `photo_dir` and `media_dir` variables. ## todo's +- use argparse module instead of crap - replies - single/muliple tags - forwarded posts diff --git a/parse.py b/parse.py deleted file mode 100644 index a19cf0b..0000000 --- a/parse.py +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env python - -# parse.py - converts telegram json to jekyll md. -# Copyright (c) 2020, Lev Brekalov - -# TODO summary: -# - replies -# - single/muliple tags -# - forwarded posts -# - custom post header - -import os -import sys -import json -from datetime import datetime - - -def print_post_header(post_title, post_date, post_tag): - # TODO: handle post tag/tags - # TODO: support for custom header - post_header = '---\ntitle: {title}\ndate: {date}\n\ -tag: {tag}\nlayout: post\n---\n'.format(\ - title=post_title, date=post_date, tag=post_tag) - - return post_header - - -def parse_post_photo(post, media_dir): - post_photo_src = post['photo'][7:] - post_photo_src = media_dir + '/' + post_photo_src - post_photo = '![image]({src})\n\n'.format(\ - src=post_photo_src) - - return post_photo - - -# def md_str(string): - # string = string.replace('\n','\n\n') - # string = string.replace('. 
', '.\n') - - # return string - - -def text_format(string, fmt): - if fmt in ('*', '**', '***', '`', '```'): - output = '{fmt}{txt}{fmt}' - elif fmt == '```': - output = '{fmt}\n{txt}\n{fmt}' - else: - output = '<{fmt}>{txt}' - - output = output.format(fmt=fmt, txt=string.strip()) - output += '\n' * string.split('\n').count('') * string.endswith('\n') - return output - -def text_link_format(text, link): - link_fmt = '[{text}]({href})' - link_fmt = link_fmt.format(text=text.strip(), href=link) - link_fmt += '\n' * text.count('\n') * text.endswith('\n') - return link_fmt - - -def parse_text_object(obj): - - obj_type = obj['type'] - obj_text = obj['text'] - - if obj_type == 'hashtag': - post_tag = obj_text - return post_tag - - elif obj_type == 'text_link': - return text_link_format(obj_text, obj['href']) - - elif obj_type == 'link' or obj_type == 'email': - post_link = '<{href}>'.format(href=obj_text.strip()) - return post_link - - elif obj_type == 'phone': - return obj_text - - elif obj_type == 'italic': - return text_format(obj_text, '*') - - elif obj_type == 'bold': - return text_format(obj_text, '**') - - elif obj_type == 'code': - return text_format(obj_text, '`') - - elif obj_type == 'pre': - return text_format(obj_text, '```') - - elif obj_type == 'underline': - return text_format(obj_text, 'u') - - elif obj_type == 'strikethrough': - return text_format(obj_text, 's') - - -def parse_post_text(post): - # TODO: handle reply-to - post_raw_text = post['text'] - post_parsed_text = '' - - if type(post_raw_text) == str: - return str(post_raw_text) - - else: - for obj in post_raw_text: - if type(obj) == str: - post_parsed_text += obj - else: - post_parsed_text += str(parse_text_object(obj)) - - return post_parsed_text - - -def parse_post_media(post, media_dir): - # get filename without parent directory - post_media_src = post['file'][post['file'].rfind("/") + 1:] - - # add parent directory - post_media_src = media_dir + '/' + post_media_src - post_media = 
'\n'.format(src=post_media_src, mime_type=post['mime_type']) - - return post_media - - -def parse_post(post): - post_output = '' - - # optional image - photo_dir = '/photos' - if 'photo' in post: - post_output += str(parse_post_photo(post, photo_dir)) - - # post text - post_output += str(parse_post_text(post)) - - # optional media - media_dir = '/files' - if 'media_type' in post: - post_output += str(parse_post_media(post, media_dir)) - - return post_output - - -def main(): - # try directory from first argument - try: - input_dir = sys.argv[1] - except IndexError as e: - # if it's not specified, use current directory - input_dir = '.' - - # create output directory - out_dir = input_dir + '/' + 'formatted_posts' - try: - os.mkdir(out_dir) - except FileExistsError as e: - pass - - # load json file - json_path = input_dir + '/' + 'result.json' - try: - with open(json_path, 'r') as f: - data = json.load(f) - except FileNotFoundError as e: - sys.exit('result.json not found.\nPlease, specify right directory') - - # load only messages - raw_posts = data['messages'] - - for post in raw_posts: - # TODO: handle forwarded posts - if post['type'] == 'message' and 'forwarded_from' not in post: - - post_date = datetime.fromisoformat(post['date']) - post_id = post['id'] - post_filename = out_dir + '/' + str(post_date.date()) + '-' \ - + str(post_id) + '.md' - - with open (post_filename, 'w') as f: - print(print_post_header( - post_id, post_date, None), - file=f) - print(parse_post(post), file=f) - - -if __name__ == '__main__': - main() - diff --git a/tg2md.py b/tg2md.py new file mode 100644 index 0000000..ca4ef35 --- /dev/null +++ b/tg2md.py @@ -0,0 +1,202 @@ +#!/usr/bin/env python + +# parse.py - converts telegram json to jekyll md. 
# Copyright (c) 2020, Lev Brekalov

# TODO summary:
# - replies
# - single/multiple tags
# - forwarded posts
# - custom post header

import os
import sys
import json
from datetime import datetime


# telegram entity type -> markdown delimiter or html element name
_TEXT_FORMATS = {
    'italic': '*',
    'bold': '**',
    'code': '`',
    'pre': '```',
    'underline': 'u',
    'strikethrough': 's',
}


def _basename(path):
    """Return the part of *path* after the last '/' (all of it if none)."""
    return path.rsplit('/', 1)[-1]


def _trailing_newlines(text):
    """Return the run of newline characters that ends *text* ('' if none)."""
    return '\n' * (len(text) - len(text.rstrip('\n')))


def _has_scheme(link):
    """Tell whether *link* already starts with a 'scheme://' prefix."""
    scheme, sep, _ = link.partition('://')
    # isalnum() rejects a '://' that only shows up later, e.g. in a query
    return bool(sep) and scheme.isalnum()


def print_post_header(post_title, post_date, post_tag):
    """Return the jekyll front matter block for one post.

    The three arguments are interpolated as-is into the `title`, `date`
    and `tag` fields; the layout is always `post`.
    """
    # TODO: handle post tag/tags
    # TODO: support for custom header
    return (
        '---\n'
        'title: {title}\n'
        'date: {date}\n'
        'tag: {tag}\n'
        'layout: post\n'
        '---\n'
    ).format(title=post_title, date=post_date, tag=post_tag)


def parse_post_photo(post, media_dir):
    """Return a markdown image that points at the post's photo.

    Only the file name of post['photo'] is kept and re-rooted under
    *media_dir*, the same way parse_post_media treats post['file'].
    """
    post_photo_src = media_dir + '/' + _basename(post['photo'])
    return '![image]({src})\n\n'.format(src=post_photo_src)


def text_format(string, fmt):
    """Wrap *string* in a markdown delimiter or an html element.

    *fmt* is either a markdown delimiter ('*', '**', '***', '`', '```')
    or the name of an html element (for example 'u' or 's').
    Surrounding whitespace is stripped so that the delimiters touch the
    text; newlines that ended *string* are put back after the closing
    delimiter.
    """
    text = string.strip()
    if not text:
        # nothing to decorate: wrapping would emit empty markers like '****'
        return string

    # the fenced form has to be tested before the inline delimiters
    if fmt == '```':
        output = '{fmt}\n{txt}\n{fmt}'
    elif fmt in ('*', '**', '***', '`'):
        output = '{fmt}{txt}{fmt}'
    else:
        output = '<{fmt}>{txt}</{fmt}>'

    return output.format(fmt=fmt, txt=text) + _trailing_newlines(string)


def text_link_format(text, link):
    """Return the markdown link '[text](link)'.

    Newlines that ended *text* are moved behind the link.
    """
    # convert telegram links to anchors
    # this implies that telegram links are pointing to the same channel
    if link.startswith('https://t.me/c/'):
        link = '#' + _basename(link.rstrip('/'))

    link_fmt = '[{text}]({href})'.format(text=text.strip(), href=link)
    return link_fmt + _trailing_newlines(text)


def parse_text_object(obj):
    """Convert one telegram text entity (a dict) to markdown.

    Reads obj['type'] and obj['text'] (plus obj['href'] for text links).
    Entity types without a markdown counterpart (hashtag, phone and any
    type not listed here) are returned as plain text.
    """
    obj_type = obj['type']
    obj_text = obj['text']

    if obj_type == 'text_link':
        return text_link_format(obj_text, obj['href'])

    if obj_type in ('link', 'email'):
        link = obj_text.strip()
        # bare links get a scheme; links that already have one are kept
        if obj_type == 'link' and not _has_scheme(link):
            link = 'https://' + link
        return '<{href}>'.format(href=link)

    fmt = _TEXT_FORMATS.get(obj_type)
    if fmt is not None:
        return text_format(obj_text, fmt)

    # returning the text keeps the literal 'None' out of the output
    return obj_text


def parse_post_text(post):
    """Return the markdown text of a post.

    post['text'] is either a plain string or a list that mixes plain
    strings with entity dicts.
    """
    # TODO: handle reply-to
    post_raw_text = post['text']

    if isinstance(post_raw_text, str):
        return post_raw_text

    return ''.join(
        obj if isinstance(obj, str) else str(parse_text_object(obj))
        for obj in post_raw_text
    )


def parse_post_media(post, media_dir):
    """Return the snippet appended to a post that has a media file."""
    # get filename without parent directory, then add parent directory
    post_media_src = media_dir + '/' + _basename(post['file'])

    # NOTE(review): this template has no {src} / {mime_type} placeholders,
    # so the result is a bare newline and both values go unused. Presumably
    # the markup that embedded the file was lost -- confirm the intended
    # template before relying on this function.
    post_media = '\n'.format(src=post_media_src,
                             mime_type=post['mime_type'])

    return post_media


def parse_post(post, photo_dir='/photos', media_dir='/files'):
    """Return the markdown body of a post: photo, text, then media.

    *photo_dir* and *media_dir* are the locations the generated
    references point to; the defaults are the previously hard-coded
    values.
    """
    post_output = ''

    # optional image
    if 'photo' in post:
        post_output += str(parse_post_photo(post, photo_dir))

    # post text
    post_output += str(parse_post_text(post))

    # optional media
    if 'media_type' in post:
        post_output += str(parse_post_media(post, media_dir))

    return post_output


def main():
    """Convert <input_dir>/result.json to one markdown file per post.

    The input directory is the first command line argument (current
    directory if omitted); posts are written to
    <input_dir>/formatted_posts/<date>-<id>.md.
    """
    input_dir = sys.argv[1] if len(sys.argv) > 1 else '.'

    # load json file first, so a wrong directory leaves nothing behind
    json_path = os.path.join(input_dir, 'result.json')
    try:
        # explicit encoding: the platform default is not always utf-8
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        sys.exit('result.json not found.\nPlease, specify right directory')

    # create output directory
    out_dir = os.path.join(input_dir, 'formatted_posts')
    os.makedirs(out_dir, exist_ok=True)

    # load only messages
    for post in data['messages']:
        # TODO: handle forwarded posts
        if post['type'] != 'message' or 'forwarded_from' in post:
            continue

        post_date = datetime.fromisoformat(post['date'])
        post_id = post['id']
        post_filename = os.path.join(
            out_dir, '{}-{}.md'.format(post_date.date(), post_id))

        with open(post_filename, 'w', encoding='utf-8') as f:
            print(print_post_header(post_id, post_date, None), file=f)
            print(parse_post(post), file=f)


if __name__ == '__main__':
    main()