about summary refs log tree commit diff stats
path: root/journal
diff options
context:
space:
mode:
Diffstat (limited to 'journal')
-rw-r--r-- journal/journal.go 119
1 files changed, 119 insertions, 0 deletions
diff --git a/journal/journal.go b/journal/journal.go
new file mode 100644
index 0000000..e2e1bea
--- /dev/null
+++ b/journal/journal.go
@@ -0,0 +1,119 @@
+package journal
+
+import (
+ "bytes"
+ "fmt"
+ "io"
+ "log"
+ "strings"
+ "time"
+
+ "git.sr.ht/~m15o/htmlj"
+ "golang.org/x/net/html"
+)
+
+const (
+ JournalTitleTag = "h1" // element whose first occurrence marks the journal and carries its title
+ EntryTitleTag = "h2" // element searched for inside an entry container; carries the entry title
+ EntryContainerTag = "article" // element that wraps a single journal entry
+ DateFormat = time.DateOnly // time layout used to parse the first 10 bytes of an entry title
+)
+
+// findNode finds first instance of tag s in n and returns pointer to it
+func findNode(n *html.Node, s string) *html.Node {
+ var c *html.Node
+ if n == nil {
+ return nil
+ }
+ for c = n.FirstChild; c != nil; c = c.NextSibling {
+ if c.Type == html.ElementNode {
+ if c.Data == s {
+ break
+ }
+ if i := findNode(c, s); i != nil {
+ c = i
+ break
+ }
+ }
+ }
+ return c
+}
+
+// Parse is a stricter version of htmlj.Parse.
+//
+// It extracts only the first journal found in the HTML read from r. The
+// journal is located by the first JournalTitleTag element; its entries are
+// the EntryContainerTag elements that follow that element as siblings, like:
+//
+//	...
+//	<h1>journal</h1>
+//	...
+//	<article>
+//	  <h2>2025-12-10</h2>
+//	  <p>test</p>
+//	</article>
+//	...
+//
+// Other structures are not accepted, while the original will grab entries
+// from arbitrarily nested parts of the HTML.
+//
+// An entry container is skipped when it holds no EntryTitleTag element or
+// when the title text does not start with a date in DateFormat. The title
+// element is removed from the container and the remaining children are
+// rendered into the entry's Content. Updated is set to the latest entry date
+// seen. A warning is logged when an entry reverses the chronological
+// direction established by the two entries before it.
+//
+// Parse returns an error when the HTML cannot be parsed, when no journal
+// title element exists, or when rendering an entry's content fails.
+func Parse(r io.Reader) (*htmlj.Journal, error) {
+	h, err := html.Parse(r)
+	if err != nil {
+		return nil, err
+	}
+	jt := findNode(h, JournalTitleTag)
+	if jt == nil {
+		return nil, fmt.Errorf("journal not found")
+	}
+
+	j := htmlj.Journal{
+		// firstText tolerates an empty title element instead of
+		// dereferencing a nil FirstChild.
+		Title: firstText(jt),
+	}
+	var b bytes.Buffer
+	for s := jt.NextSibling; s != nil; s = s.NextSibling {
+		if s.Type != html.ElementNode || s.Data != EntryContainerTag {
+			continue
+		}
+
+		et := findNode(s, EntryTitleTag)
+		if et == nil {
+			continue
+		}
+
+		// The title must begin with a date; 10 is the byte length of a
+		// DateFormat-formatted date such as "2025-12-10".
+		title := firstText(et)
+		if len(title) < 10 {
+			continue
+		}
+		t, err := time.Parse(DateFormat, title[0:10])
+		if err != nil {
+			continue
+		}
+
+		if j.Updated.Before(t) {
+			j.Updated = t
+		}
+		b.Reset()
+		// findNode searches the whole subtree, so et is not necessarily a
+		// direct child of s. RemoveChild panics unless it is called on the
+		// node's actual parent.
+		et.Parent.RemoveChild(et)
+		for c := s.FirstChild; c != nil; c = c.NextSibling {
+			if err := html.Render(&b, c); err != nil {
+				return nil, err
+			}
+		}
+		j.Entries = append(j.Entries, htmlj.Entry{
+			Title:     title,
+			Published: t,
+			Content:   strings.TrimSpace(b.String()),
+		})
+		// Compare the last three entries: if the middle one is a local
+		// peak or valley, the newest entry breaks the date ordering.
+		if len(j.Entries) > 2 {
+			l := len(j.Entries) - 1
+			t1, t2, t3 := j.Entries[l-2].Published,
+				j.Entries[l-1].Published, j.Entries[l].Published
+			if (t1.Before(t2) && t2.After(t3)) ||
+				(t1.After(t2) && t2.Before(t3)) {
+				log.Println("warning: non-consecutive log entry:", title)
+			}
+		}
+	}
+
+	return &j, nil
+}
+
+// firstText returns the whitespace-trimmed text of n's first child when that
+// child is a text node. It returns "" when n is nil, has no children, or
+// starts with a non-text node, so callers never read a tag name or comment
+// body as a title.
+func firstText(n *html.Node) string {
+	if n == nil || n.FirstChild == nil || n.FirstChild.Type != html.TextNode {
+		return ""
+	}
+	return strings.TrimSpace(n.FirstChild.Data)
+}