add argparse

and do some refactoring
author: la-ninpre <leobrekalini@gmail.com> 2020-12-10 20:09:50 +0300
committer: la-ninpre <leobrekalini@gmail.com> 2020-12-10 20:09:50 +0300
commit: 4c57d96e545b73e0423566f22695179f58341abf (patch)
tree: 191d0875e8ab2655a9f325040b836fd50cab81a9
parent: 63d6cfec84bfa750604734e269d3c1c9e3c1fc04 (diff)
download: tg2md-4c57d96e545b73e0423566f22695179f58341abf.tar.gz
tg2md-4c57d96e545b73e0423566f22695179f58341abf.zip
1 files changed, 111 insertions, 45 deletions
diff --git a/tg2md.py b/tg2md.py
index ca4ef35..90e219d 100644
--- a/tg2md.py
+++ b/tg2md.py
@@ -11,37 +11,61 @@
 
 import os
 import sys
+import argparse
 import json
 from datetime import datetime
 
 
-def print_post_header(post_title, post_date, post_tag):
+def print_default_post_header(post_title, post_date, post_tag):
+
+    '''
+    returns default post header
+    '''
+
     # TODO: handle post tag/tags
     # TODO: support for custom header
-    post_header = '---\ntitle: {title}\ndate: {date}\n\
-tag: {tag}\nlayout: post\n---\n'.format(\
-            title=post_title, date=post_date, tag=post_tag)
+    post_header = '---\n'\
+        'title: {title}\n'\
+        'date: {date}\n'\
+        'tags: {tag}\n'\
+        'layout: post\n'\
+        '---\n'.format(title=post_title, date=post_date, tag=post_tag)
 
     return post_header
 
 
-def parse_post_photo(post, media_dir):
-    post_photo_src = post['photo'][7:]
-    post_photo_src = media_dir + '/' + post_photo_src
-    post_photo = '![image]({src})\n\n'.format(\
-            src=post_photo_src)
+def print_custom_post_header(post_header_file, *args):
+
+    '''
+    now unusable (i dunno how it may work)
+    '''
+
+    with post_header_file as f:
+        post_header_content = read(post_header_file)
+    for arg in args:
+        pass
+    return post_header_content
 
-    return post_photo
 
+def parse_post_photo(post, photo_dir):
 
-# def md_str(string):
-    # string = string.replace('\n','\n\n')
-    # string = string.replace('. ', '.\n')
+    '''
+    converts photo tag to markdown image link
+    '''
 
-    # return string
+    post_photo_src = os.path.basename(post['photo'])
+    post_photo_src = os.path.join(photo_dir, post_photo_src)
+    post_photo = '![image]({src})\n\n'.format(src=post_photo_src)
+
+    return post_photo
 
 
 def text_format(string, fmt):
+
+    '''
+    wraps string in markdown-styled formatting
+    '''
+
     if fmt in ('*', '**', '***', '`', '```'):
         output = '{fmt}{txt}{fmt}'
     elif fmt == '```':
@@ -53,7 +77,13 @@ def text_format(string, fmt):
     output += '\n' * string.split('\n').count('') * string.endswith('\n')
     return output
 
+
 def text_link_format(text, link):
+
+    '''
+    formats links
+    '''
+
     # convert telegram links to anchors
     # this implies that telegram links are pointing to the same channel
     if link.startswith('https://t.me/c/'):
@@ -66,6 +96,10 @@ def text_link_format(text, link):
 
 def parse_text_object(obj):
 
+    '''
+    detects type of text object and wraps it in corresponding formatting
+    '''
+
     obj_type = obj['type']
     obj_text = obj['text']
 
@@ -79,7 +113,7 @@ def parse_text_object(obj):
     elif obj_type == 'link' or obj_type == 'email':
         link = obj_text.strip()
         link = 'https://' * (obj_type == 'link') * \
-                (1 - link.startswith('https://')) + link
+            (1 - link.startswith('https://')) + link
         post_link = '<{href}>'.format(href=link)
         return post_link
 
@@ -124,23 +158,32 @@ def parse_post_text(post):
 
 
 def parse_post_media(post, media_dir):
+
+    '''
+    wraps file links into html tags
+    '''
+
     # get filename without parent directory
-    post_media_src = post['file'][post['file'].rfind("/") + 1:]
+    post_media_src = os.path.basename(post['file'])
 
     # add parent directory
-    post_media_src = media_dir + '/' + post_media_src
+    post_media_src = os.path.join(media_dir, post_media_src)
     post_media = '\n<audio controls>\n \
         <source src="{src}" type="{mime_type}">\n \
         </audio>'.format(src=post_media_src, mime_type=post['mime_type'])
 
     return post_media
-    
 
-def parse_post(post):
+
+def parse_post(post, photo_dir, media_dir):
+
+    '''
+    converts post object to formatted text
+    '''
+
     post_output = ''
-    
+
     # optional image
-    photo_dir = '/photos'
     if 'photo' in post:
         post_output += str(parse_post_photo(post, photo_dir))
 
@@ -148,7 +191,6 @@ def parse_post(post):
     post_output += str(parse_post_text(post))
 
     # optional media
-    media_dir = '/files'
     if 'media_type' in post:
         post_output += str(parse_post_media(post, media_dir))
 
@@ -156,47 +198,71 @@ def parse_post(post):
 
 
 def main():
-    # try directory from first argument
-    try:
-        input_dir = sys.argv[1]
-    except IndexError as e:
-        # if it's not specified, use current directory
-        input_dir = '.'
 
-    # create output directory
-    out_dir = input_dir + '/' + 'formatted_posts'
+    parser = argparse.ArgumentParser(
+            usage='%(prog)s [options] json_file',
+            description='Convert exported Telegram channel data json to \
+                    bunch of markdown posts ready to use with jekyll')
+    parser.add_argument(
+            'json', metavar='json_file',
+            type=open,
+            help='result.json file from telegram export')
+    parser.add_argument(
+            '--out-dir', metavar='out_dir',
+            nargs='?', default='formatted_posts',
+            help='output directory for markdown files\
+                    (default: formatted_posts)')
+    parser.add_argument(
+            '--photo-dir', metavar='photo_dir',
+            nargs='?', default='photos',
+            help='location of image files. this changes only links\
+                    to photos in markdown text, so specify your\
+                    desired location (default: photos)')
+    parser.add_argument(
+            '--media-dir', metavar='media_dir',
+            nargs='?', default='files',
+            help='location of media files. this changes only links\
+                    to files in markdown text, so specify your \
+                    desired location (default: files)')
+    args_wip = parser.add_argument_group('work in progress')
+    args_wip.add_argument(
+            '--post-header', metavar='post_header',
+            nargs='?',
+            help='yaml front matter for your posts \
+                    (now doesn\'t work)')
+
+    args = parser.parse_args()
+
     try:
-        os.mkdir(out_dir)
-    except FileExistsError as e:
+        os.mkdir(args.out_dir)
+    except FileExistsError:
         pass
 
     # load json file
-    json_path = input_dir + '/' + 'result.json'
     try:
-        with open(json_path, 'r') as f:
+        with args.json as f:
             data = json.load(f)
-    except FileNotFoundError as e:
-        sys.exit('result.json not found.\nPlease, specify right directory')
+    except FileNotFoundError:
+        sys.exit('result.json not found.\nPlease, specify right file')
 
     # load only messages
     raw_posts = data['messages']
 
     for post in raw_posts:
-    # TODO: handle forwarded posts
+        # TODO: handle forwarded posts
         if post['type'] == 'message' and 'forwarded_from' not in post:
 
             post_date = datetime.fromisoformat(post['date'])
             post_id = post['id']
-            post_filename = out_dir + '/' + str(post_date.date()) + '-' \
-                    + str(post_id) + '.md'
+            post_filename = str(post_date.date()) + '-' + str(post_id) + '.md'
+            post_path = os.path.join(args.out_dir, post_filename)
 
-            with open (post_filename, 'w') as f:
-                print(print_post_header(
-                    post_id, post_date, None), 
-                    file=f)
-                print(parse_post(post), file=f)
+            with open(post_path, 'w') as f:
+                print(print_default_post_header(
+                    post_id, post_date, None),
+                        file=f)
+                print(parse_post(post, args.photo_dir, args.media_dir), file=f)
 
 
 if __name__ == '__main__':
     main()
-
author	la-ninpre <leobrekalini@gmail.com>	2020-12-10 20:09:50 +0300
committer	la-ninpre <leobrekalini@gmail.com>	2020-12-10 20:09:50 +0300
commit	4c57d96e545b73e0423566f22695179f58341abf (patch)
tree	191d0875e8ab2655a9f325040b836fd50cab81a9
parent	63d6cfec84bfa750604734e269d3c1c9e3c1fc04 (diff)
download	tg2md-4c57d96e545b73e0423566f22695179f58341abf.tar.gz tg2md-4c57d96e545b73e0423566f22695179f58341abf.zip