| author | la-ninpre <aaoth@aaoth.xyz> | 2025-12-10 22:48:14 +0300 |
|---|---|---|
| committer | la-ninpre <aaoth@aaoth.xyz> | 2025-12-10 22:48:14 +0300 |
| commit | f3fcd25e29d1a8c4d355eb650091e2920def658e (patch) | |
| tree | 7f537b44a47359e5437e67251b3a5d4de1bb642c /journal | |
| download | html-journal-f3fcd25e29d1a8c4d355eb650091e2920def658e.tar.gz html-journal-f3fcd25e29d1a8c4d355eb650091e2920def658e.zip | |
init
Diffstat (limited to 'journal')
| -rw-r--r-- | journal/journal.go | 119 |
1 file changed, 119 insertions, 0 deletions
```diff
diff --git a/journal/journal.go b/journal/journal.go
new file mode 100644
index 0000000..e2e1bea
--- /dev/null
+++ b/journal/journal.go
@@ -0,0 +1,119 @@
+package journal
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"log"
+	"strings"
+	"time"
+
+	"git.sr.ht/~m15o/htmlj"
+	"golang.org/x/net/html"
+)
+
+const (
+	JournalTitleTag   = "h1"
+	EntryTitleTag     = "h2"
+	EntryContainerTag = "article"
+	DateFormat        = time.DateOnly
+)
+
+// findNode finds first instance of tag s in n and returns pointer to it
+func findNode(n *html.Node, s string) *html.Node {
+	var c *html.Node
+	if n == nil {
+		return nil
+	}
+	for c = n.FirstChild; c != nil; c = c.NextSibling {
+		if c.Type == html.ElementNode {
+			if c.Data == s {
+				break
+			}
+			if i := findNode(c, s); i != nil {
+				c = i
+				break
+			}
+		}
+	}
+	return c
+}
+
+// Parse is a stricter version of htmlj.Parse.
+//
+// This version extracts only the first journal from a given html.
+// It only accepts journals that are flat, like:
+//
+//	...
+//	<h1>journal</h1>
+//	...
+//	<article>
+//	  <h2>2025-12-10</h2>
+//	  <p>test</p>
+//	</article>
+//	...
+//
+// but won't accept other structures, while original will grab
+// entries from arbitrarily nested parts of html.
+func Parse(r io.Reader) (*htmlj.Journal, error) {
+	h, err := html.Parse(r)
+	if err != nil {
+		return nil, err
+	}
+	jt := findNode(h, JournalTitleTag)
+	if jt == nil {
+		return nil, fmt.Errorf("journal not found")
+	}
+
+	j := htmlj.Journal{
+		Title: strings.TrimSpace(jt.FirstChild.Data),
+	}
+	var b bytes.Buffer
+	for s := jt.NextSibling; s != nil; s = s.NextSibling {
+		if s.Type != html.ElementNode || s.Data != EntryContainerTag {
+			continue
+		}
+
+		et := findNode(s, EntryTitleTag)
+		if et == nil {
+			continue
+		}
+
+		title := strings.TrimSpace(et.FirstChild.Data)
+		if len(title) < 10 {
+			continue
+		}
+		t, err := time.Parse(DateFormat, title[0:10])
+		if err != nil {
+			continue
+		}
+
+		if j.Updated.Before(t) {
+			j.Updated = t
+		}
+		b.Reset()
+		s.RemoveChild(et)
+		for c := s.FirstChild; c != nil; c = c.NextSibling {
+			if err := html.Render(&b, c); err != nil {
+				// can this ever happen?
+				return nil, err
+			}
+		}
+		j.Entries = append(j.Entries, htmlj.Entry{
+			Title:     title,
+			Published: t,
+			Content:   strings.TrimSpace(b.String()),
+		})
+		if len(j.Entries) > 2 {
+			l := len(j.Entries) - 1
+			t1, t2, t3 := j.Entries[l-2].Published,
+				j.Entries[l-1].Published, j.Entries[l].Published
+			if (t1.Before(t2) && t2.After(t3)) ||
+				(t1.After(t2) && t2.Before(t3)) {
+				log.Println("warning: non-consecutive log entry:", title)
+			}
+		}
+	}
+
+	return &j, nil
+}
```
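For orientation, here is a minimal usage sketch of the new `Parse` function. The import path `example.org/html-journal/journal` is a placeholder (the commit does not show the module path), and the sketch only relies on names visible above: `Parse`, `DateFormat`, and the `htmlj.Journal`/`htmlj.Entry` fields that `Parse` populates.

```go
package main

import (
	"fmt"
	"log"
	"strings"

	// placeholder module path; substitute the repository's real module path
	"example.org/html-journal/journal"
)

func main() {
	// A flat journal in the shape Parse expects: an <h1> title followed by
	// sibling <article> elements whose <h2> starts with a YYYY-MM-DD date.
	const page = `
<h1>journal</h1>
<article>
  <h2>2025-12-10</h2>
  <p>test</p>
</article>`

	j, err := journal.Parse(strings.NewReader(page))
	if err != nil {
		log.Fatal(err)
	}

	fmt.Printf("%s (updated %s)\n", j.Title, j.Updated.Format(journal.DateFormat))
	for _, e := range j.Entries {
		fmt.Println(e.Published.Format(journal.DateFormat), e.Title)
		fmt.Println(e.Content)
	}
}
```

Note that `Parse` keeps each entry's remaining inner HTML (with the `<h2>` removed) in `Content`, so rendering or feed generation is left to the caller.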
