all repos — tg2md @ 4c57d96e545b73e0423566f22695179f58341abf

converter from telegram json to jekyll md

add argparse

and do some refactoring
la-ninpre leobrekalini@gmail.com
Thu, 10 Dec 2020 20:09:50 +0300
commit

4c57d96e545b73e0423566f22695179f58341abf

parent

63d6cfec84bfa750604734e269d3c1c9e3c1fc04

1 files changed, 111 insertions(+), 45 deletions(-)

jump to
M tg2md.pytg2md.py

@@ -11,37 +11,61 @@ # - custom post header

import os import sys +import argparse import json from datetime import datetime -def print_post_header(post_title, post_date, post_tag): +def print_default_post_header(post_title, post_date, post_tag): + + ''' + returns default post header + ''' + # TODO: handle post tag/tags # TODO: support for custom header - post_header = '---\ntitle: {title}\ndate: {date}\n\ -tag: {tag}\nlayout: post\n---\n'.format(\ - title=post_title, date=post_date, tag=post_tag) + post_header = '---\n'\ + 'title: {title}\n'\ + 'date: {date}\n'\ + 'tags: {tag}\n'\ + 'layout: post\n'\ + '---\n'.format(title=post_title, date=post_date, tag=post_tag) return post_header -def parse_post_photo(post, media_dir): - post_photo_src = post['photo'][7:] - post_photo_src = media_dir + '/' + post_photo_src - post_photo = '![image]({src})\n\n'.format(\ - src=post_photo_src) +def print_custom_post_header(post_header_file, *args): - return post_photo + ''' + now unusable (i dunno how it may work) + ''' + with post_header_file as f: + post_header_content = read(post_header_file) + for arg in args: + pass + return post_header_content -# def md_str(string): - # string = string.replace('\n','\n\n') - # string = string.replace('. ', '.\n') - # return string +def parse_post_photo(post, photo_dir): + + ''' + converts photo tag to markdown image link + ''' + + post_photo_src = os.path.basename(post['photo']) + post_photo_src = os.path.join(photo_dir, post_photo_src) + post_photo = '![image]({src})\n\n'.format(src=post_photo_src) + + return post_photo def text_format(string, fmt): + + ''' + wraps string in markdown-styled formatting + ''' + if fmt in ('*', '**', '***', '`', '```'): output = '{fmt}{txt}{fmt}' elif fmt == '```':

@@ -53,7 +77,13 @@ output = output.format(fmt=fmt, txt=string.strip())

output += '\n' * string.split('\n').count('') * string.endswith('\n') return output + def text_link_format(text, link): + + ''' + formats links + ''' + # convert telegram links to anchors # this implies that telegram links are pointing to the same channel if link.startswith('https://t.me/c/'):

@@ -66,6 +96,10 @@

def parse_text_object(obj): + ''' + detects type of text object and wraps it in corresponding formatting + ''' + obj_type = obj['type'] obj_text = obj['text']

@@ -79,7 +113,7 @@

elif obj_type == 'link' or obj_type == 'email': link = obj_text.strip() link = 'https://' * (obj_type == 'link') * \ - (1 - link.startswith('https://')) + link + (1 - link.startswith('https://')) + link post_link = '<{href}>'.format(href=link) return post_link

@@ -124,23 +158,32 @@ return post_parsed_text

def parse_post_media(post, media_dir): + + ''' + wraps file links into html tags + ''' + # get filename without parent directory - post_media_src = post['file'][post['file'].rfind("/") + 1:] + post_media_src = os.path.basename(post['file']) # add parent directory - post_media_src = media_dir + '/' + post_media_src + post_media_src = os.path.join(media_dir, post_media_src) post_media = '\n<audio controls>\n \ <source src="{src}" type="{mime_type}">\n \ </audio>'.format(src=post_media_src, mime_type=post['mime_type']) return post_media - + + +def parse_post(post, photo_dir, media_dir): -def parse_post(post): + ''' + converts post object to formatted text + ''' + post_output = '' - + # optional image - photo_dir = '/photos' if 'photo' in post: post_output += str(parse_post_photo(post, photo_dir))

@@ -148,7 +191,6 @@ # post text

post_output += str(parse_post_text(post)) # optional media - media_dir = '/files' if 'media_type' in post: post_output += str(parse_post_media(post, media_dir))

@@ -156,47 +198,71 @@ return post_output

def main(): - # try directory from first argument - try: - input_dir = sys.argv[1] - except IndexError as e: - # if it's not specified, use current directory - input_dir = '.' - # create output directory - out_dir = input_dir + '/' + 'formatted_posts' + parser = argparse.ArgumentParser( + usage='%(prog)s [options] json_file', + description='Convert exported Telegram channel data json to \ + bunch of markdown posts ready to use with jekyll') + parser.add_argument( + 'json', metavar='json_file', + type=open, + help='result.json file from telegram export') + parser.add_argument( + '--out-dir', metavar='out_dir', + nargs='?', default='formatted_posts', + help='output directory for markdown files\ + (default: formatted_posts)') + parser.add_argument( + '--photo-dir', metavar='photo_dir', + nargs='?', default='photos', + help='location of image files. this changes only links\ + to photos in markdown text, so specify your\ + desired location (default: photos)') + parser.add_argument( + '--media-dir', metavar='media_dir', + nargs='?', default='files', + help='location of media files. this changes only links\ + to files in markdown text, so specify your \ + desired location (default: files)') + args_wip = parser.add_argument_group('work in progress') + args_wip.add_argument( + '--post-header', metavar='post_header', + nargs='?', + help='yaml front matter for your posts \ + (now doesn\'t work)') + + args = parser.parse_args() + try: - os.mkdir(out_dir) - except FileExistsError as e: + os.mkdir(args.out_dir) + except FileExistsError: pass # load json file - json_path = input_dir + '/' + 'result.json' try: - with open(json_path, 'r') as f: + with args.json as f: data = json.load(f) - except FileNotFoundError as e: - sys.exit('result.json not found.\nPlease, specify right directory') + except FileNotFoundError: + sys.exit('result.json not found.\nPlease, specify right file') # load only messages raw_posts = data['messages'] for post in raw_posts: - # TODO: handle forwarded posts + # TODO: handle forwarded posts if post['type'] == 'message' and 'forwarded_from' not in post: post_date = datetime.fromisoformat(post['date']) post_id = post['id'] - post_filename = out_dir + '/' + str(post_date.date()) + '-' \ - + str(post_id) + '.md' + post_filename = str(post_date.date()) + '-' + str(post_id) + '.md' + post_path = os.path.join(args.out_dir, post_filename) - with open (post_filename, 'w') as f: - print(print_post_header( - post_id, post_date, None), - file=f) - print(parse_post(post), file=f) + with open(post_path, 'w') as f: + print(print_default_post_header( + post_id, post_date, None), + file=f) + print(parse_post(post, args.photo_dir, args.media_dir), file=f) if __name__ == '__main__': main() -