1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
|
package journal
import (
"bytes"
"fmt"
"io"
"log"
"strings"
"time"
"git.sr.ht/~m15o/htmlj"
"golang.org/x/net/html"
)
// Tag names and date layout used by Parse to recognise a journal.
const (
// JournalTitleTag is the element whose first occurrence marks the journal
// title; Parse only looks for entries among its following siblings.
JournalTitleTag = "h1"
// EntryTitleTag is the element searched for inside each entry container;
// its text must begin with a date in DateFormat.
EntryTitleTag = "h2"
// EntryContainerTag is the element that wraps a single journal entry.
EntryContainerTag = "article"
// DateFormat is the layout passed to time.Parse for the first 10 bytes
// of an entry title.
DateFormat = time.DateOnly
)
// findNode performs a depth-first, document-order search of the descendants
// of n for the first element node whose tag name equals s.
//
// n itself is never tested, and the search only descends through element
// nodes. It returns nil when n is nil or when no matching element exists.
func findNode(n *html.Node, s string) *html.Node {
	if n == nil {
		return nil
	}
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		// Text, comment and doctype nodes can neither match nor be descended into.
		if child.Type != html.ElementNode {
			continue
		}
		if child.Data == s {
			return child
		}
		if found := findNode(child, s); found != nil {
			return found
		}
	}
	return nil
}
// Parse is a stricter version of htmlj.Parse.
//
// This version extracts only the first journal from a given html.
// It only accepts journals that are flat, like:
//
//	...
//	<h1>journal</h1>
//	...
//	<article>
//	  <h2>2025-12-10</h2>
//	  <p>test</p>
//	</article>
//	...
//
// but won't accept other structures, while the original will grab
// entries from arbitrarily nested parts of html.
//
// The journal title is the first JournalTitleTag element in the document.
// Entries are the EntryContainerTag elements that follow it as siblings.
// A container is skipped silently when it has no EntryTitleTag element or
// when the first 10 bytes of that title do not parse as DateFormat. The
// title element is removed from the container and the remaining children
// are rendered to form the entry content.
//
// Parse returns an error when the input cannot be parsed as HTML, when no
// journal title element is found, or when rendering entry content fails.
// As a side effect it logs a warning through the standard log package when
// an entry breaks the monotonic date order of the two entries before it.
func Parse(r io.Reader) (*htmlj.Journal, error) {
	doc, err := html.Parse(r)
	if err != nil {
		return nil, err
	}
	jt := findNode(doc, JournalTitleTag)
	if jt == nil {
		return nil, fmt.Errorf("journal not found")
	}
	j := htmlj.Journal{
		// An empty <h1></h1> has no child node; treat it as an empty title
		// instead of dereferencing a nil FirstChild.
		Title: firstChildData(jt),
	}
	var b bytes.Buffer
	for s := jt.NextSibling; s != nil; s = s.NextSibling {
		if s.Type != html.ElementNode || s.Data != EntryContainerTag {
			continue
		}
		et := findNode(s, EntryTitleTag)
		if et == nil {
			continue
		}
		// An empty <h2></h2> yields "" here and is rejected by the length
		// check below rather than panicking.
		title := firstChildData(et)
		if len(title) < 10 {
			continue
		}
		t, err := time.Parse(DateFormat, title[:10])
		if err != nil {
			continue
		}
		if j.Updated.Before(t) {
			j.Updated = t
		}
		b.Reset()
		// findNode searches at any depth, so the title is not necessarily a
		// direct child of the container. Node.RemoveChild panics unless it is
		// called on the node's actual parent, so detach it from there.
		et.Parent.RemoveChild(et)
		for c := s.FirstChild; c != nil; c = c.NextSibling {
			if err := html.Render(&b, c); err != nil {
				// can this ever happen?
				return nil, err
			}
		}
		j.Entries = append(j.Entries, htmlj.Entry{
			Title:     title,
			Published: t,
			Content:   strings.TrimSpace(b.String()),
		})
		// Warn when the last three entries form a peak or a valley, i.e. the
		// newest entry reverses the date direction of the previous pair.
		if n := len(j.Entries); n > 2 {
			t1 := j.Entries[n-3].Published
			t2 := j.Entries[n-2].Published
			t3 := j.Entries[n-1].Published
			if (t1.Before(t2) && t2.After(t3)) ||
				(t1.After(t2) && t2.Before(t3)) {
				log.Println("warning: non-consecutive log entry:", title)
			}
		}
	}
	return &j, nil
}

// firstChildData returns the whitespace-trimmed Data field of n's first
// child, or "" when n is nil or has no children.
func firstChildData(n *html.Node) string {
	if n == nil || n.FirstChild == nil {
		return ""
	}
	return strings.TrimSpace(n.FirstChild.Data)
}
|