package journal
import (
"bytes"
"fmt"
"io"
"log"
"strings"
"time"
"git.sr.ht/~m15o/htmlj"
"golang.org/x/net/html"
)
const (
// JournalTitleTag is the tag name of the element Parse searches for to
// locate the journal; the first child of that element supplies the
// journal title.
JournalTitleTag = "h1"
// EntryTitleTag is the tag name of the element Parse looks for inside an
// entry container to obtain the entry title.
EntryTitleTag = "h2"
// EntryContainerTag is the tag name a sibling following the journal title
// must have for Parse to consider it an entry.
EntryContainerTag = "article"
// DateFormat is the time layout Parse applies to the first 10 bytes of an
// entry title to obtain the entry's publication date.
DateFormat = time.DateOnly
)
// findNode returns the first element named s among the descendants of n,
// or nil when n is nil or no such element exists.
//
// The search is depth-first in document order. n itself is never reported
// as a match, and only element nodes are compared or descended into.
func findNode(n *html.Node, s string) *html.Node {
	if n == nil {
		return nil
	}
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		if child.Type != html.ElementNode {
			continue
		}
		if child.Data == s {
			return child
		}
		// Look inside this element before moving on to its next sibling.
		if found := findNode(child, s); found != nil {
			return found
		}
	}
	return nil
}
// Parse is a stricter version of htmlj.Parse.
//
// This version extracts only the first journal from a given html.
// It only accepts journals that are flat, like:
//
// ...
//
journal
// ...
//
// 2025-12-10
// test
//
// ...
//
// but won't accept other structures, while original will grab
// entries from arbitrarily nested parts of html.
func Parse(r io.Reader) (*htmlj.Journal, error) {
h, err := html.Parse(r)
if err != nil {
return nil, err
}
jt := findNode(h, JournalTitleTag)
if jt == nil {
return nil, fmt.Errorf("journal not found")
}
j := htmlj.Journal{
Title: strings.TrimSpace(jt.FirstChild.Data),
}
var b bytes.Buffer
for s := jt.NextSibling; s != nil; s = s.NextSibling {
if s.Type != html.ElementNode || s.Data != EntryContainerTag {
continue
}
et := findNode(s, EntryTitleTag)
if et == nil {
continue
}
title := strings.TrimSpace(et.FirstChild.Data)
if len(title) < 10 {
continue
}
t, err := time.Parse(DateFormat, title[0:10])
if err != nil {
continue
}
if j.Updated.Before(t) {
j.Updated = t
}
b.Reset()
s.RemoveChild(et)
for c := s.FirstChild; c != nil; c = c.NextSibling {
if err := html.Render(&b, c); err != nil {
// can this ever happen?
return nil, err
}
}
j.Entries = append(j.Entries, htmlj.Entry{
Title: title,
Published: t,
Content: strings.TrimSpace(b.String()),
})
if len(j.Entries) > 2 {
l := len(j.Entries) - 1
t1, t2, t3 := j.Entries[l-2].Published,
j.Entries[l-1].Published, j.Entries[l].Published
if (t1.Before(t2) && t2.After(t3)) ||
(t1.After(t2) && t2.Before(t3)) {
log.Println("warning: non-consecutive log entry:", title)
}
}
}
return &j, nil
}