all repos — tg2md @ 2d26d28c8f891bae27bf075b443b1e82185e4a37

converter from telegram json to jekyll md

fix formatted text parsing

previously, strings such as 'bold\ntext\n' that should be rendered as bold
text were converted incorrectly (e.g. to '**bold\ntext\n**', which breaks
the markup because of the newline characters before the closing asterisks).
this is now mostly fixed.

also there's now support for emails and <pre> blocks.
la-ninpre leobrekalini@gmail.com
Mon, 16 Nov 2020 19:45:19 +0300
commit

2d26d28c8f891bae27bf075b443b1e82185e4a37

parent

767dc7f6954a66e187e5eefc36c21868048e04fc

M parse.py

@@ -11,29 +11,24 @@ # TODO summary:

# - replies # - single/muliple tags # - forwarded posts +# - custom post header import os import sys import json from datetime import datetime -# post: -# header -# [photo?] -# text -# [media?] - -# text: -# [str|list(str|obj, ...)] def print_post_header(post_title, post_date, post_tag): # TODO: handle post tag/tags + # TODO: support for custom header post_header = '---\ntitle: {title}\ndate: {date}\n\ tag: {tag}\nlayout: post\n---\n'.format(\ title=post_title, date=post_date, tag=post_tag) return post_header + def parse_post_photo(post, media_dir): post_photo_src = post['photo'][7:] post_photo_src = media_dir + '/' + post_photo_src

@@ -42,22 +37,34 @@ src=post_photo_src)

return post_photo -def md_str(string): - string = string.replace('\n','\n\n') - string = string.replace('. ', '.\n') - return string +# def md_str(string): + # string = string.replace('\n','\n\n') + # string = string.replace('. ', '.\n') + # return string -def parse_text_object(obj): - ''' - Parse object from post text. - Objects are text links, plain links, underlined text, strikethrough text, - italic text, bold text, code blocks and hashtags. +def text_format(string, fmt): + if fmt in ('*', '**', '***', '`', '```'): + output = '{fmt}{txt}{fmt}' + elif fmt == '```': + output = '{fmt}\n{txt}\n{fmt}' + else: + output = '<{fmt}>{txt}</{fmt}>' - This is a mess, but what is better? - ''' + output = output.format(fmt=fmt, txt=string.strip()) + output += '\n' * string.split('\n').count('') * string.endswith('\n') + return output + +def text_link_format(text, link): + link_fmt = '[{text}]({href})' + link_fmt = link_fmt.format(text=text.strip(), href=link) + link_fmt += '\n' * text.count('\n') * text.endswith('\n') + return link_fmt + + +def parse_text_object(obj): obj_type = obj['type'] obj_text = obj['text']

@@ -67,41 +74,32 @@ post_tag = obj_text

return post_tag elif obj_type == 'text_link': - post_text_link = '[{text}]({href})'.format(text=obj_text, \ - href=obj['href']) - return post_text_link + return text_link_format(obj_text, obj['href']) - elif obj_type == 'link': - post_link = '[link]({href})'.format(href=obj_text) + elif obj_type == 'link' or obj_type == 'email': + post_link = '<{href}>'.format(href=obj_text.strip()) return post_link - # I dunno how this appeared, but it seems like hyphenated numbers - # are treated as phone numbers, so return them as plain text. elif obj_type == 'phone': return obj_text - # output = '*{str}*'.format(str=string.strip()) - # output += '\n' * string.count('\n') * string.endswith('\n') + elif obj_type == 'italic': + return text_format(obj_text, '*') elif obj_type == 'bold': - post_inline_bold = '**{text}**'.format(text=obj_text.strip()) - return post_inline_bold + return text_format(obj_text, '**') - elif obj_type == 'italic': - post_inline_italic = '*{text}*'.format(text=obj_text.strip()) - return post_inline_italic + elif obj_type == 'code': + return text_format(obj_text, '`') + + elif obj_type == 'pre': + return text_format(obj_text, '```') elif obj_type == 'underline': - post_inline_underline = '<u>{text}</u>'.format(text=obj_text.strip()) - return post_inline_underline + return text_format(obj_text, 'u') elif obj_type == 'strikethrough': - post_inline_strike = '<s>{text}</s>'.format(text=obj_text.strip()) - return post_inline_strike - - elif obj_type == 'code' or obj_type == 'pre': - post_inline_code = '```\n{text}\n```'.format(text=obj_text) - return post_inline_code + return text_format(obj_text, 's') def parse_post_text(post):
A tests/formatted_posts/2020-11-12-10.md

@@ -0,0 +1,14 @@

+--- +title: 10 +date: 2020-11-12 14:11:35 +tag: None +layout: post +--- + +**bold** in text +**bold below** + +normal text + +**bold with one line gap** +normal text
A tests/formatted_posts/2020-11-12-11.md

@@ -0,0 +1,11 @@

+--- +title: 11 +date: 2020-11-12 14:34:47 +tag: None +layout: post +--- + +**multiline +bold + +text**
A tests/formatted_posts/2020-11-12-13.md

@@ -0,0 +1,11 @@

+--- +title: 13 +date: 2020-11-12 14:45:43 +tag: None +layout: post +--- + +`monospace with one baktick` + +```multiline +monospace```
A tests/formatted_posts/2020-11-12-14.md

@@ -0,0 +1,12 @@

+--- +title: 14 +date: 2020-11-12 14:48:48 +tag: None +layout: post +--- + +[text link](http://example.com/) + +<example.com> + +<example@example.com>
A tests/formatted_posts/2020-11-12-15.md

@@ -0,0 +1,8 @@

+--- +title: 15 +date: 2020-11-12 15:05:32 +tag: None +layout: post +--- + +*bold italic*
A tests/formatted_posts/2020-11-12-2.md

@@ -0,0 +1,8 @@

+--- +title: 2 +date: 2020-11-12 12:54:07 +tag: None +layout: post +--- + +test text post
A tests/formatted_posts/2020-11-12-3.md

@@ -0,0 +1,10 @@

+--- +title: 3 +date: 2020-11-12 12:57:31 +tag: None +layout: post +--- + +![image](/photos/photo_1@12-11-2020_12-57-31.jpg) + +
A tests/formatted_posts/2020-11-12-4.md

@@ -0,0 +1,10 @@

+--- +title: 4 +date: 2020-11-12 12:57:40 +tag: None +layout: post +--- + +![image](/photos/photo_1@12-11-2020_12-57-31.jpg) + +photo with text
A tests/formatted_posts/2020-11-12-5.md

@@ -0,0 +1,16 @@

+--- +title: 5 +date: 2020-11-12 12:58:18 +tag: None +layout: post +--- + +*italic* + +**bold** + +<u>underline</u> + +<s>strikethrough</s> + +```monospace```
A tests/formatted_posts/2020-11-12-7.md

@@ -0,0 +1,11 @@

+--- +title: 7 +date: 2020-11-12 13:01:05 +tag: None +layout: post +--- + + +<audio controls> + <source src="/files/audio_1@12-11-2020_13-01-05.ogg" type="audio/ogg"> + </audio>
A tests/formatted_posts/2020-11-12-8.md

@@ -0,0 +1,11 @@

+--- +title: 8 +date: 2020-11-12 13:02:35 +tag: None +layout: post +--- + + +<audio controls> + <source src="/files/test-sound.ogg" type="audio/x-vorbis+ogg"> + </audio>
A tests/formatted_posts/2020-11-12-9.md

@@ -0,0 +1,10 @@

+--- +title: 9 +date: 2020-11-12 13:43:23 +tag: None +layout: post +--- + +*italic* in text + +*italic on whole line*