all repos — tg2md @ 767dc7f6954a66e187e5eefc36c21868048e04fc

converter from telegram json to jekyll md

add tests

i've found some issues with formatted text; these can be seen in the output
(which is not included for now). there are some complications with parsing
newline characters, but i'll fix them later.

also, the script is now usable as a command-line application. you can specify
the directory in which the source files are located.

to try it on the test data, you need to run
$ python parse.py tests

this will create the directory tests/formatted_posts, which will be
populated with the generated markdown posts.

if the directory is not specified, the script uses the current directory,
    i.e. it assumes that it is run from the source folder. if the required
    json file cannot be found, it exits with code 1.
la-ninpre leobrekalini@gmail.com
Thu, 12 Nov 2020 22:38:08 +0300
commit

767dc7f6954a66e187e5eefc36c21868048e04fc

parent

c11899c33dbd1dbf0865c9d99ccea95d9922df0b

M parse.py

@@ -13,7 +13,7 @@ # - single/muliple tags

# - forwarded posts import os -# import sys +import sys import json from datetime import datetime

@@ -36,8 +36,8 @@ return post_header

def parse_post_photo(post, media_dir): post_photo_src = post['photo'][7:] - post_photo_src = media_dir + post_photo_src - post_photo = '![image](/assets/img/posts/{src})\n\n'.format(\ + post_photo_src = media_dir + '/' + post_photo_src + post_photo = '![image]({src})\n\n'.format(\ src=post_photo_src) return post_photo

@@ -80,23 +80,26 @@ # are treated as phone numbers, so return them as plain text.

elif obj_type == 'phone': return obj_text + # output = '*{str}*'.format(str=string.strip()) + # output += '\n' * string.count('\n') * string.endswith('\n') + elif obj_type == 'bold': - post_inline_bold = '**{text}**'.format(text=obj_text) + post_inline_bold = '**{text}**'.format(text=obj_text.strip()) return post_inline_bold elif obj_type == 'italic': - post_inline_italic = '*{text}*'.format(text=obj_text) + post_inline_italic = '*{text}*'.format(text=obj_text.strip()) return post_inline_italic elif obj_type == 'underline': - post_inline_underline = '<u>{text}</u>'.format(text=obj_text) + post_inline_underline = '<u>{text}</u>'.format(text=obj_text.strip()) return post_inline_underline elif obj_type == 'strikethrough': - post_inline_strike = '<s>{text}</s>'.format(text=obj_text) + post_inline_strike = '<s>{text}</s>'.format(text=obj_text.strip()) return post_inline_strike - elif obj_type == 'code': + elif obj_type == 'code' or obj_type == 'pre': post_inline_code = '```\n{text}\n```'.format(text=obj_text) return post_inline_code

@@ -124,7 +127,7 @@ # get filename without parent directory

post_media_src = post['file'][post['file'].rfind("/") + 1:] # add parent directory - post_media_src = media_dir + post_media_src + post_media_src = media_dir + '/' + post_media_src post_media = '\n<audio controls>\n \ <source src="{src}" type="{mime_type}">\n \ </audio>'.format(src=post_media_src, mime_type=post['mime_type'])

@@ -136,15 +139,15 @@ def parse_post(post):

post_output = '' # optional image - photo_dir = '/assets/img/posts/' + photo_dir = '/photos' if 'photo' in post: post_output += str(parse_post_photo(post, photo_dir)) # post text - post_output += md_str(parse_post_text(post)) + post_output += str(parse_post_text(post)) # optional media - media_dir = '/assets/sound/posts/' + media_dir = '/files' if 'media_type' in post: post_output += str(parse_post_media(post, media_dir))

@@ -152,16 +155,27 @@ return post_output

def main(): + # try directory from first argument + try: + input_dir = sys.argv[1] + except IndexError as e: + # if it's not specified, use current directory + input_dir = '.' + # create output directory - out_dir = './formatted_posts' + out_dir = input_dir + '/' + 'formatted_posts' try: os.mkdir(out_dir) except FileExistsError as e: pass # load json file - with open('result.json', 'r') as f: - data = json.load(f) + json_path = input_dir + '/' + 'result.json' + try: + with open(json_path, 'r') as f: + data = json.load(f) + except FileNotFoundError as e: + sys.exit('result.json not found.\nPlease, specify right directory') # load only messages raw_posts = data['messages']
A tests/result.json

@@ -0,0 +1,214 @@

+{ + "name": "test", + "type": "private_channel", + "id": 9882011936, + "messages": [ + { + "id": 1, + "type": "service", + "date": "2020-11-12T12:53:52", + "actor": "test", + "actor_id": 9882011936, + "action": "create_channel", + "title": "test", + "text": "" + }, + { + "id": 2, + "type": "message", + "date": "2020-11-12T12:54:07", + "from": "test", + "from_id": 9882011936, + "text": "test text post" + }, + { + "id": 3, + "type": "message", + "date": "2020-11-12T12:57:31", + "from": "test", + "from_id": 9882011936, + "photo": "photos/photo_1@12-11-2020_12-57-31.jpg", + "width": 801, + "height": 526, + "text": "" + }, + { + "id": 4, + "type": "message", + "date": "2020-11-12T12:57:40", + "from": "test", + "from_id": 9882011936, + "photo": "photos/photo_1@12-11-2020_12-57-31.jpg", + "width": 801, + "height": 526, + "text": "photo with text" + }, + { + "id": 5, + "type": "message", + "date": "2020-11-12T12:58:18", + "edited": "2020-11-12T13:03:00", + "from": "test", + "from_id": 9882011936, + "text": [ + { + "type": "italic", + "text": "italic\n\n" + }, + { + "type": "bold", + "text": "bold\n\n" + }, + { + "type": "underline", + "text": "underline\n\n" + }, + { + "type": "strikethrough", + "text": "strikethrough\n\n" + }, + { + "type": "pre", + "text": "monospace", + "language": "" + } + ] + }, + { + "id": 7, + "type": "message", + "date": "2020-11-12T13:01:05", + "from": "test", + "from_id": 9882011936, + "file": "voice_messages/audio_1@12-11-2020_13-01-05.ogg", + "media_type": "voice_message", + "mime_type": "audio/ogg", + "duration_seconds": 2, + "text": "" + }, + { + "id": 8, + "type": "message", + "date": "2020-11-12T13:02:35", + "from": "test", + "from_id": 9882011936, + "file": "files/test-sound.ogg", + "media_type": "audio_file", + "mime_type": "audio/x-vorbis+ogg", + "duration_seconds": 1, + "text": "" + }, + { + "id": 9, + "type": "message", + "date": "2020-11-12T13:43:23", + "from": "test", + "from_id": 9882011936, + "text": [ + { + "type": "italic", + 
"text": "italic" + }, + " in text\n\n", + { + "type": "italic", + "text": "italic on whole line" + } + ] + }, + { + "id": 10, + "type": "message", + "date": "2020-11-12T14:11:35", + "edited": "2020-11-12T14:18:17", + "from": "test", + "from_id": 9882011936, + "text": [ + { + "type": "bold", + "text": "bold" + }, + " in text\n", + { + "type": "bold", + "text": "bold below\n\n" + }, + "normal text\n\n", + { + "type": "bold", + "text": "bold with one line gap\n" + }, + "normal text" + ] + }, + { + "id": 11, + "type": "message", + "date": "2020-11-12T14:34:47", + "from": "test", + "from_id": 9882011936, + "text": [ + { + "type": "bold", + "text": "multiline\nbold\n\ntext" + } + ] + }, + { + "id": 13, + "type": "message", + "date": "2020-11-12T14:45:43", + "edited": "2020-11-12T14:45:55", + "from": "test", + "from_id": 9882011936, + "text": [ + { + "type": "code", + "text": "monospace with one baktick\n\n" + }, + { + "type": "pre", + "text": "multiline\nmonospace", + "language": "" + } + ] + }, + { + "id": 14, + "type": "message", + "date": "2020-11-12T14:48:48", + "edited": "2020-11-12T14:49:39", + "from": "test", + "from_id": 9882011936, + "text": [ + { + "type": "text_link", + "text": "text link\n\n", + "href": "http://example.com/" + }, + { + "type": "link", + "text": "example.com" + }, + "\n\n", + { + "type": "email", + "text": "example@example.com" + } + ] + }, + { + "id": 15, + "type": "message", + "date": "2020-11-12T15:05:32", + "from": "test", + "from_id": 9882011936, + "text": [ + { + "type": "italic", + "text": "bold italic" + } + ] + } + ] +}