package journal import ( "bytes" "fmt" "io" "log" "strings" "time" "git.sr.ht/~m15o/htmlj" "golang.org/x/net/html" ) const ( JournalTitleTag = "h1" EntryTitleTag = "h2" EntryContainerTag = "article" DateFormat = time.DateOnly ) // findNode finds first instance of tag s in n and returns pointer to it func findNode(n *html.Node, s string) *html.Node { var c *html.Node if n == nil { return nil } for c = n.FirstChild; c != nil; c = c.NextSibling { if c.Type == html.ElementNode { if c.Data == s { break } if i := findNode(c, s); i != nil { c = i break } } } return c } // Parse is a stricter version of htmlj.Parse. // // This version extracts only the first journal from a given html. // It only accepts journals that are flat, like: // // ... //
//	<h1>journal</h1>
//	...
//	<article>
//	  <h2>2025-12-10</h2>
//	  <p>test</p>
//	</article>
// ... // // but won't accept other structures, while original will grab // entries from arbitrarily nested parts of html. func Parse(r io.Reader) (*htmlj.Journal, error) { h, err := html.Parse(r) if err != nil { return nil, err } jt := findNode(h, JournalTitleTag) if jt == nil { return nil, fmt.Errorf("journal not found") } j := htmlj.Journal{ Title: strings.TrimSpace(jt.FirstChild.Data), } var b bytes.Buffer for s := jt.NextSibling; s != nil; s = s.NextSibling { if s.Type != html.ElementNode || s.Data != EntryContainerTag { continue } et := findNode(s, EntryTitleTag) if et == nil { continue } title := strings.TrimSpace(et.FirstChild.Data) if len(title) < 10 { continue } t, err := time.Parse(DateFormat, title[0:10]) if err != nil { continue } if j.Updated.Before(t) { j.Updated = t } b.Reset() s.RemoveChild(et) for c := s.FirstChild; c != nil; c = c.NextSibling { if err := html.Render(&b, c); err != nil { // can this ever happen? return nil, err } } j.Entries = append(j.Entries, htmlj.Entry{ Title: title, Published: t, Content: strings.TrimSpace(b.String()), }) if len(j.Entries) > 2 { l := len(j.Entries) - 1 t1, t2, t3 := j.Entries[l-2].Published, j.Entries[l-1].Published, j.Entries[l].Published if (t1.Before(t2) && t2.After(t3)) || (t1.After(t2) && t2.Before(t3)) { log.Println("warning: non-consecutive log entry:", title) } } } return &j, nil }