journal/journal.go


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119

package journal

import (
	"bytes"
	"fmt"
	"io"
	"log"
	"strings"
	"time"

	"git.sr.ht/~m15o/htmlj"
	"golang.org/x/net/html"
)

// Element names and the date layout used by Parse.
//
// JournalTitleTag is the element whose first occurrence marks the journal:
// its first child supplies the journal title and its following siblings are
// scanned for entries. EntryContainerTag is the element that wraps a single
// entry, and EntryTitleTag is the element inside a container that holds the
// entry title. DateFormat is the layout the first 10 bytes of an entry title
// must match (time.DateOnly, i.e. "2006-01-02").
const (
	JournalTitleTag   = "h1"
	EntryTitleTag     = "h2"
	EntryContainerTag = "article"
	DateFormat        = time.DateOnly
)

// findNode searches the descendants of n, depth first and in document order,
// for the first element whose tag name equals s and returns a pointer to it.
//
// n itself is never considered a match. Only element children are inspected
// or descended into; children of any other node type are skipped together
// with their subtrees. The result is nil when n is nil or nothing matches.
func findNode(n *html.Node, s string) *html.Node {
	if n == nil {
		return nil
	}
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		if child.Type != html.ElementNode {
			continue
		}
		// A direct hit wins over anything nested inside this child.
		if child.Data == s {
			return child
		}
		if found := findNode(child, s); found != nil {
			return found
		}
	}
	return nil
}

// Parse is a stricter version of htmlj.Parse.
//
// This version extracts only the first journal from a given html.
// The journal starts at the first JournalTitleTag element in document order,
// and its entries are the EntryContainerTag elements that follow that title
// as siblings. It only accepts journals that are flat, like:
//
//	...
//	<h1>journal</h1>
//	...
//	<article>
//	    <h2>2025-12-10</h2>
//	    <p>test</p>
//	</article>
//	...
//
// but won't accept other structures, while original will grab
// entries from arbitrarily nested parts of html.
//
// The journal title is the whitespace-trimmed data of the title element's
// first child; an empty title element yields an empty title.
//
// A container is silently skipped when it has no EntryTitleTag element, when
// that element is empty, or when the trimmed data of its first child does not
// start with a date in DateFormat. For every accepted entry the title element
// is detached from the tree and the remaining children of the container are
// rendered, trimmed, and stored as the entry content. Journal.Updated is set
// to the latest entry date seen.
//
// A warning is logged whenever the middle one of the last three accepted
// entries is strictly newer or strictly older than both of its neighbours,
// i.e. the entries are not in a consistent date order.
//
// An error is returned when the input cannot be parsed as HTML, when no
// journal title element exists, or when rendering an entry fails.
func Parse(r io.Reader) (*htmlj.Journal, error) {
	h, err := html.Parse(r)
	if err != nil {
		return nil, err
	}
	jt := findNode(h, JournalTitleTag)
	if jt == nil {
		return nil, fmt.Errorf("journal not found")
	}

	var j htmlj.Journal
	// An empty <h1></h1> has no children; keep the title empty rather than
	// dereferencing a nil FirstChild.
	if jt.FirstChild != nil {
		j.Title = strings.TrimSpace(jt.FirstChild.Data)
	}

	var b bytes.Buffer
	for s := jt.NextSibling; s != nil; s = s.NextSibling {
		if s.Type != html.ElementNode || s.Data != EntryContainerTag {
			continue
		}

		et := findNode(s, EntryTitleTag)
		// An empty entry title cannot carry a date, so treat it like a
		// missing one instead of dereferencing a nil FirstChild.
		if et == nil || et.FirstChild == nil {
			continue
		}

		title := strings.TrimSpace(et.FirstChild.Data)
		if len(title) < 10 {
			continue
		}
		t, err := time.Parse(DateFormat, title[0:10])
		if err != nil {
			continue
		}

		if j.Updated.Before(t) {
			j.Updated = t
		}
		b.Reset()
		// findNode searches recursively, so et may sit deeper than a direct
		// child of s. RemoveChild panics unless it is called on the node's
		// actual parent, hence et.Parent rather than s.
		et.Parent.RemoveChild(et)
		for c := s.FirstChild; c != nil; c = c.NextSibling {
			if err := html.Render(&b, c); err != nil {
				// can this ever happen?
				return nil, err
			}
		}
		j.Entries = append(j.Entries, htmlj.Entry{
			Title:     title,
			Published: t,
			Content:   strings.TrimSpace(b.String()),
		})
		if len(j.Entries) > 2 {
			l := len(j.Entries) - 1
			t1, t2, t3 := j.Entries[l-2].Published,
				j.Entries[l-1].Published, j.Entries[l].Published
			// The middle entry is a local maximum or minimum: the date
			// order changed direction somewhere in the last three entries.
			if (t1.Before(t2) && t2.After(t3)) ||
				(t1.After(t2) && t2.Before(t3)) {
				log.Println("warning: non-consecutive log entry:", title)
			}
		}
	}

	return &j, nil
}