aaoth.xyz repos — tg2md: 767dc7f6954a66e187e5eefc36c21868048e04fc

add tests

i've found some issues with formatted text; these can be seen in the output
(which is not included for now). there are some complications with parsing
newline characters, but i'll fix them later.

also, the script is now usable as a command-line application. you can specify
the directory in which the source files are located.

to run the tests, execute
$ python parse.py tests

this will create the directory tests/formatted_posts, which will be
populated with the generated markdown posts.

if no directory is specified, the script uses the current directory as
the source folder. if the required json file (result.json) cannot be
found there, it exits with code 1.

la-ninpre leobrekalini@gmail.com

Thu, 12 Nov 2020 22:38:08 +0300

commit

767dc7f6954a66e187e5eefc36c21868048e04fc

parent

c11899c33dbd1dbf0865c9d99ccea95d9922df0b

5 files changed, 243 insertions(+), 15 deletions(-)

jump to

parse.py

tests/files/test-sound.ogg

tests/photos/photo_1@12-11-2020_12-57-31.jpg

tests/result.json

tests/voice_messages/audio_1@12-11-2020_13-01-05.ogg

M parse.py → parse.py

@@ -13,7 +13,7 @@ # - single/muliple tags
 # - forwarded posts
 
 import os
-# import sys
+import sys
 import json
 from datetime import datetime
 
@@ -36,8 +36,8 @@ return post_header
 
 def parse_post_photo(post, media_dir):
     post_photo_src = post['photo'][7:]
-    post_photo_src = media_dir + post_photo_src
-    post_photo = '![image](/assets/img/posts/{src})\n\n'.format(\
+    post_photo_src = media_dir + '/' + post_photo_src
+    post_photo = '![image]({src})\n\n'.format(\
             src=post_photo_src)
 
     return post_photo
@@ -80,23 +80,26 @@ # are treated as phone numbers, so return them as plain text.
     elif obj_type == 'phone':
         return obj_text
 
+    # output = '*{str}*'.format(str=string.strip())
+    # output += '\n' * string.count('\n') * string.endswith('\n')
+
     elif obj_type == 'bold':
-        post_inline_bold = '**{text}**'.format(text=obj_text)
+        post_inline_bold = '**{text}**'.format(text=obj_text.strip())
         return post_inline_bold
 
     elif obj_type == 'italic':
-        post_inline_italic = '*{text}*'.format(text=obj_text)
+        post_inline_italic = '*{text}*'.format(text=obj_text.strip())
         return post_inline_italic
 
     elif obj_type == 'underline':
-        post_inline_underline = '<u>{text}</u>'.format(text=obj_text)
+        post_inline_underline = '<u>{text}</u>'.format(text=obj_text.strip())
         return post_inline_underline
 
     elif obj_type == 'strikethrough':
-        post_inline_strike = '<s>{text}</s>'.format(text=obj_text)
+        post_inline_strike = '<s>{text}</s>'.format(text=obj_text.strip())
         return post_inline_strike
 
-    elif obj_type == 'code':
+    elif obj_type == 'code' or obj_type == 'pre':
         post_inline_code = '```\n{text}\n```'.format(text=obj_text)
         return post_inline_code
 
@@ -124,7 +127,7 @@ # get filename without parent directory
     post_media_src = post['file'][post['file'].rfind("/") + 1:]
 
     # add parent directory
-    post_media_src = media_dir + post_media_src
+    post_media_src = media_dir + '/' + post_media_src
     post_media = '\n<audio controls>\n \
         <source src="{src}" type="{mime_type}">\n \
         </audio>'.format(src=post_media_src, mime_type=post['mime_type'])
@@ -136,15 +139,15 @@ def parse_post(post):
     post_output = ''
     
     # optional image
-    photo_dir = '/assets/img/posts/'
+    photo_dir = '/photos'
     if 'photo' in post:
         post_output += str(parse_post_photo(post, photo_dir))
 
     # post text
-    post_output += md_str(parse_post_text(post))
+    post_output += str(parse_post_text(post))
 
     # optional media
-    media_dir = '/assets/sound/posts/'
+    media_dir = '/files'
     if 'media_type' in post:
         post_output += str(parse_post_media(post, media_dir))
 
@@ -152,16 +155,27 @@ return post_output
 
 
 def main():
+    # try directory from first argument
+    try:
+        input_dir = sys.argv[1]
+    except IndexError as e:
+        # if it's not specified, use current directory
+        input_dir = '.'
+
     # create output directory
-    out_dir = './formatted_posts'
+    out_dir = input_dir + '/' + 'formatted_posts'
     try:
         os.mkdir(out_dir)
     except FileExistsError as e:
         pass
 
     # load json file
-    with open('result.json', 'r') as f:
-        data = json.load(f)
+    json_path = input_dir + '/' + 'result.json'
+    try:
+        with open(json_path, 'r') as f:
+            data = json.load(f)
+    except FileNotFoundError as e:
+        sys.exit('result.json not found.\nPlease, specify right directory')
 
     # load only messages
     raw_posts = data['messages']

A tests/files/test-sound.ogg

A tests/photos/photo_1@12-11-2020_12-57-31.jpg

A tests/result.json

@@ -0,0 +1,214 @@
+{
+ "name": "test",
+ "type": "private_channel",
+ "id": 9882011936,
+ "messages": [
+  {
+   "id": 1,
+   "type": "service",
+   "date": "2020-11-12T12:53:52",
+   "actor": "test",
+   "actor_id": 9882011936,
+   "action": "create_channel",
+   "title": "test",
+   "text": ""
+  },
+  {
+   "id": 2,
+   "type": "message",
+   "date": "2020-11-12T12:54:07",
+   "from": "test",
+   "from_id": 9882011936,
+   "text": "test text post"
+  },
+  {
+   "id": 3,
+   "type": "message",
+   "date": "2020-11-12T12:57:31",
+   "from": "test",
+   "from_id": 9882011936,
+   "photo": "photos/photo_1@12-11-2020_12-57-31.jpg",
+   "width": 801,
+   "height": 526,
+   "text": ""
+  },
+  {
+   "id": 4,
+   "type": "message",
+   "date": "2020-11-12T12:57:40",
+   "from": "test",
+   "from_id": 9882011936,
+   "photo": "photos/photo_1@12-11-2020_12-57-31.jpg",
+   "width": 801,
+   "height": 526,
+   "text": "photo with text"
+  },
+  {
+   "id": 5,
+   "type": "message",
+   "date": "2020-11-12T12:58:18",
+   "edited": "2020-11-12T13:03:00",
+   "from": "test",
+   "from_id": 9882011936,
+   "text": [
+    {
+     "type": "italic",
+     "text": "italic\n\n"
+    },
+    {
+     "type": "bold",
+     "text": "bold\n\n"
+    },
+    {
+     "type": "underline",
+     "text": "underline\n\n"
+    },
+    {
+     "type": "strikethrough",
+     "text": "strikethrough\n\n"
+    },
+    {
+     "type": "pre",
+     "text": "monospace",
+     "language": ""
+    }
+   ]
+  },
+  {
+   "id": 7,
+   "type": "message",
+   "date": "2020-11-12T13:01:05",
+   "from": "test",
+   "from_id": 9882011936,
+   "file": "voice_messages/audio_1@12-11-2020_13-01-05.ogg",
+   "media_type": "voice_message",
+   "mime_type": "audio/ogg",
+   "duration_seconds": 2,
+   "text": ""
+  },
+  {
+   "id": 8,
+   "type": "message",
+   "date": "2020-11-12T13:02:35",
+   "from": "test",
+   "from_id": 9882011936,
+   "file": "files/test-sound.ogg",
+   "media_type": "audio_file",
+   "mime_type": "audio/x-vorbis+ogg",
+   "duration_seconds": 1,
+   "text": ""
+  },
+  {
+   "id": 9,
+   "type": "message",
+   "date": "2020-11-12T13:43:23",
+   "from": "test",
+   "from_id": 9882011936,
+   "text": [
+    {
+     "type": "italic",
+     "text": "italic"
+    },
+    " in text\n\n",
+    {
+     "type": "italic",
+     "text": "italic on whole line"
+    }
+   ]
+  },
+  {
+   "id": 10,
+   "type": "message",
+   "date": "2020-11-12T14:11:35",
+   "edited": "2020-11-12T14:18:17",
+   "from": "test",
+   "from_id": 9882011936,
+   "text": [
+    {
+     "type": "bold",
+     "text": "bold"
+    },
+    " in text\n",
+    {
+     "type": "bold",
+     "text": "bold below\n\n"
+    },
+    "normal text\n\n",
+    {
+     "type": "bold",
+     "text": "bold with one line gap\n"
+    },
+    "normal text"
+   ]
+  },
+  {
+   "id": 11,
+   "type": "message",
+   "date": "2020-11-12T14:34:47",
+   "from": "test",
+   "from_id": 9882011936,
+   "text": [
+    {
+     "type": "bold",
+     "text": "multiline\nbold\n\ntext"
+    }
+   ]
+  },
+  {
+   "id": 13,
+   "type": "message",
+   "date": "2020-11-12T14:45:43",
+   "edited": "2020-11-12T14:45:55",
+   "from": "test",
+   "from_id": 9882011936,
+   "text": [
+    {
+     "type": "code",
+     "text": "monospace with one baktick\n\n"
+    },
+    {
+     "type": "pre",
+     "text": "multiline\nmonospace",
+     "language": ""
+    }
+   ]
+  },
+  {
+   "id": 14,
+   "type": "message",
+   "date": "2020-11-12T14:48:48",
+   "edited": "2020-11-12T14:49:39",
+   "from": "test",
+   "from_id": 9882011936,
+   "text": [
+    {
+     "type": "text_link",
+     "text": "text link\n\n",
+     "href": "http://example.com/"
+    },
+    {
+     "type": "link",
+     "text": "example.com"
+    },
+    "\n\n",
+    {
+     "type": "email",
+     "text": "example@example.com"
+    }
+   ]
+  },
+  {
+   "id": 15,
+   "type": "message",
+   "date": "2020-11-12T15:05:32",
+   "from": "test",
+   "from_id": 9882011936,
+   "text": [
+    {
+     "type": "italic",
+     "text": "bold italic"
+    }
+   ]
+  }
+ ]
+}

A tests/voice_messages/audio_1@12-11-2020_13-01-05.ogg

all repos — tg2md @ 767dc7f6954a66e187e5eefc36c21868048e04fc

converter from telegram json to jekyll md