about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author la-ninpre <leobrekalini@gmail.com> 2020-11-16 19:45:19 +0300
committer la-ninpre <leobrekalini@gmail.com> 2020-11-16 19:45:19 +0300
commit 2d26d28c8f891bae27bf075b443b1e82185e4a37 (patch)
tree c8e136490e16f2388e5927f9a54738978b0987fb
parent 767dc7f6954a66e187e5eefc36c21868048e04fc (diff)
download tg2md-2d26d28c8f891bae27bf075b443b1e82185e4a37.tar.gz
tg2md-2d26d28c8f891bae27bf075b443b1e82185e4a37.zip
fix formatted text parsing
Earlier, formatted strings that contained or ended with newlines, such as 'bold\ntext\n', were converted incorrectly: the result looked like '**bold\ntext\n**', which breaks the markup because the newline characters end up before the closing asterisks. Now the text is stripped before it is wrapped and newlines are re-appended after the closing marker, so this is mostly fixed. Also, there is now support for emails and <pre> blocks.
-rw-r--r-- parse.py 80
-rw-r--r-- tests/formatted_posts/2020-11-12-10.md 14
-rw-r--r-- tests/formatted_posts/2020-11-12-11.md 11
-rw-r--r-- tests/formatted_posts/2020-11-12-13.md 11
-rw-r--r-- tests/formatted_posts/2020-11-12-14.md 12
-rw-r--r-- tests/formatted_posts/2020-11-12-15.md 8
-rw-r--r-- tests/formatted_posts/2020-11-12-2.md 8
-rw-r--r-- tests/formatted_posts/2020-11-12-3.md 10
-rw-r--r-- tests/formatted_posts/2020-11-12-4.md 10
-rw-r--r-- tests/formatted_posts/2020-11-12-5.md 16
-rw-r--r-- tests/formatted_posts/2020-11-12-7.md 11
-rw-r--r-- tests/formatted_posts/2020-11-12-8.md 11
-rw-r--r-- tests/formatted_posts/2020-11-12-9.md 10
13 files changed, 171 insertions, 41 deletions
diff --git a/parse.py b/parse.py
index 098af81..4a768d0 100644
--- a/parse.py
+++ b/parse.py
@@ -11,29 +11,24 @@
# - replies
# - single/muliple tags
# - forwarded posts
+# - custom post header
import os
import sys
import json
from datetime import datetime
-# post:
-# header
-# [photo?]
-# text
-# [media?]
-
-# text:
-# [str|list(str|obj, ...)]
def print_post_header(post_title, post_date, post_tag):
# TODO: handle post tag/tags
+ # TODO: support for custom header
post_header = '---\ntitle: {title}\ndate: {date}\n\
tag: {tag}\nlayout: post\n---\n'.format(\
title=post_title, date=post_date, tag=post_tag)
return post_header
+
def parse_post_photo(post, media_dir):
post_photo_src = post['photo'][7:]
post_photo_src = media_dir + '/' + post_photo_src
@@ -42,22 +37,34 @@ def parse_post_photo(post, media_dir):
return post_photo
-def md_str(string):
- string = string.replace('\n','\n\n')
- string = string.replace('. ', '.\n')
- return string
+# def md_str(string):
+ # string = string.replace('\n','\n\n')
+ # string = string.replace('. ', '.\n')
+ # return string
-def parse_text_object(obj):
- '''
- Parse object from post text.
- Objects are text links, plain links, underlined text, strikethrough text,
- italic text, bold text, code blocks and hashtags.
+def text_format(string, fmt):
+ if fmt in ('*', '**', '***', '`', '```'):
+ output = '{fmt}{txt}{fmt}'
+ elif fmt == '```':
+ output = '{fmt}\n{txt}\n{fmt}'
+ else:
+ output = '<{fmt}>{txt}</{fmt}>'
+
+ output = output.format(fmt=fmt, txt=string.strip())
+ output += '\n' * string.split('\n').count('') * string.endswith('\n')
+ return output
+
+def text_link_format(text, link):
+ link_fmt = '[{text}]({href})'
+ link_fmt = link_fmt.format(text=text.strip(), href=link)
+ link_fmt += '\n' * text.count('\n') * text.endswith('\n')
+ return link_fmt
- This is a mess, but what is better?
- '''
+
+def parse_text_object(obj):
obj_type = obj['type']
obj_text = obj['text']
@@ -67,41 +74,32 @@ def parse_text_object(obj):
return post_tag
elif obj_type == 'text_link':
- post_text_link = '[{text}]({href})'.format(text=obj_text, \
- href=obj['href'])
- return post_text_link
+ return text_link_format(obj_text, obj['href'])
- elif obj_type == 'link':
- post_link = '[link]({href})'.format(href=obj_text)
+ elif obj_type == 'link' or obj_type == 'email':
+ post_link = '<{href}>'.format(href=obj_text.strip())
return post_link
- # I dunno how this appeared, but it seems like hyphenated numbers
- # are treated as phone numbers, so return them as plain text.
elif obj_type == 'phone':
return obj_text
- # output = '*{str}*'.format(str=string.strip())
- # output += '\n' * string.count('\n') * string.endswith('\n')
+ elif obj_type == 'italic':
+ return text_format(obj_text, '*')
elif obj_type == 'bold':
- post_inline_bold = '**{text}**'.format(text=obj_text.strip())
- return post_inline_bold
+ return text_format(obj_text, '**')
- elif obj_type == 'italic':
- post_inline_italic = '*{text}*'.format(text=obj_text.strip())
- return post_inline_italic
+ elif obj_type == 'code':
+ return text_format(obj_text, '`')
+
+ elif obj_type == 'pre':
+ return text_format(obj_text, '```')
elif obj_type == 'underline':
- post_inline_underline = '<u>{text}</u>'.format(text=obj_text.strip())
- return post_inline_underline
+ return text_format(obj_text, 'u')
elif obj_type == 'strikethrough':
- post_inline_strike = '<s>{text}</s>'.format(text=obj_text.strip())
- return post_inline_strike
-
- elif obj_type == 'code' or obj_type == 'pre':
- post_inline_code = '```\n{text}\n```'.format(text=obj_text)
- return post_inline_code
+ return text_format(obj_text, 's')
def parse_post_text(post):
diff --git a/tests/formatted_posts/2020-11-12-10.md b/tests/formatted_posts/2020-11-12-10.md
new file mode 100644
index 0000000..dab9ba4
--- /dev/null
+++ b/tests/formatted_posts/2020-11-12-10.md
@@ -0,0 +1,14 @@
+---
+title: 10
+date: 2020-11-12 14:11:35
+tag: None
+layout: post
+---
+
+**bold** in text
+**bold below**
+
+normal text
+
+**bold with one line gap**
+normal text
diff --git a/tests/formatted_posts/2020-11-12-11.md b/tests/formatted_posts/2020-11-12-11.md
new file mode 100644
index 0000000..2e4b2c2
--- /dev/null
+++ b/tests/formatted_posts/2020-11-12-11.md
@@ -0,0 +1,11 @@
+---
+title: 11
+date: 2020-11-12 14:34:47
+tag: None
+layout: post
+---
+
+**multiline
+bold
+
+text**
diff --git a/tests/formatted_posts/2020-11-12-13.md b/tests/formatted_posts/2020-11-12-13.md
new file mode 100644
index 0000000..01fa3c6
--- /dev/null
+++ b/tests/formatted_posts/2020-11-12-13.md
@@ -0,0 +1,11 @@
+---
+title: 13
+date: 2020-11-12 14:45:43
+tag: None
+layout: post
+---
+
+`monospace with one baktick`
+
+```multiline
+monospace```
diff --git a/tests/formatted_posts/2020-11-12-14.md b/tests/formatted_posts/2020-11-12-14.md
new file mode 100644
index 0000000..092be7d
--- /dev/null
+++ b/tests/formatted_posts/2020-11-12-14.md
@@ -0,0 +1,12 @@
+---
+title: 14
+date: 2020-11-12 14:48:48
+tag: None
+layout: post
+---
+
+[text link](http://example.com/)
+
+<example.com>
+
+<example@example.com>
diff --git a/tests/formatted_posts/2020-11-12-15.md b/tests/formatted_posts/2020-11-12-15.md
new file mode 100644
index 0000000..b8e65e0
--- /dev/null
+++ b/tests/formatted_posts/2020-11-12-15.md
@@ -0,0 +1,8 @@
+---
+title: 15
+date: 2020-11-12 15:05:32
+tag: None
+layout: post
+---
+
+*bold italic*
diff --git a/tests/formatted_posts/2020-11-12-2.md b/tests/formatted_posts/2020-11-12-2.md
new file mode 100644
index 0000000..ded2627
--- /dev/null
+++ b/tests/formatted_posts/2020-11-12-2.md
@@ -0,0 +1,8 @@
+---
+title: 2
+date: 2020-11-12 12:54:07
+tag: None
+layout: post
+---
+
+test text post
diff --git a/tests/formatted_posts/2020-11-12-3.md b/tests/formatted_posts/2020-11-12-3.md
new file mode 100644
index 0000000..f9bfe3d
--- /dev/null
+++ b/tests/formatted_posts/2020-11-12-3.md
@@ -0,0 +1,10 @@
+---
+title: 3
+date: 2020-11-12 12:57:31
+tag: None
+layout: post
+---
+
+![image](/photos/photo_1@12-11-2020_12-57-31.jpg)
+
+
diff --git a/tests/formatted_posts/2020-11-12-4.md b/tests/formatted_posts/2020-11-12-4.md
new file mode 100644
index 0000000..8beb106
--- /dev/null
+++ b/tests/formatted_posts/2020-11-12-4.md
@@ -0,0 +1,10 @@
+---
+title: 4
+date: 2020-11-12 12:57:40
+tag: None
+layout: post
+---
+
+![image](/photos/photo_1@12-11-2020_12-57-31.jpg)
+
+photo with text
diff --git a/tests/formatted_posts/2020-11-12-5.md b/tests/formatted_posts/2020-11-12-5.md
new file mode 100644
index 0000000..13e734e
--- /dev/null
+++ b/tests/formatted_posts/2020-11-12-5.md
@@ -0,0 +1,16 @@
+---
+title: 5
+date: 2020-11-12 12:58:18
+tag: None
+layout: post
+---
+
+*italic*
+
+**bold**
+
+<u>underline</u>
+
+<s>strikethrough</s>
+
+```monospace```
diff --git a/tests/formatted_posts/2020-11-12-7.md b/tests/formatted_posts/2020-11-12-7.md
new file mode 100644
index 0000000..5a9e6ec
--- /dev/null
+++ b/tests/formatted_posts/2020-11-12-7.md
@@ -0,0 +1,11 @@
+---
+title: 7
+date: 2020-11-12 13:01:05
+tag: None
+layout: post
+---
+
+
+<audio controls>
+ <source src="/files/audio_1@12-11-2020_13-01-05.ogg" type="audio/ogg">
+ </audio>
diff --git a/tests/formatted_posts/2020-11-12-8.md b/tests/formatted_posts/2020-11-12-8.md
new file mode 100644
index 0000000..0b30a03
--- /dev/null
+++ b/tests/formatted_posts/2020-11-12-8.md
@@ -0,0 +1,11 @@
+---
+title: 8
+date: 2020-11-12 13:02:35
+tag: None
+layout: post
+---
+
+
+<audio controls>
+ <source src="/files/test-sound.ogg" type="audio/x-vorbis+ogg">
+ </audio>
diff --git a/tests/formatted_posts/2020-11-12-9.md b/tests/formatted_posts/2020-11-12-9.md
new file mode 100644
index 0000000..cbf1a9f
--- /dev/null
+++ b/tests/formatted_posts/2020-11-12-9.md
@@ -0,0 +1,10 @@
+---
+title: 9
+date: 2020-11-12 13:43:23
+tag: None
+layout: post
+---
+
+*italic* in text
+
+*italic on whole line*