| author | la-ninpre <aaoth@aaoth.xyz> | 2025-12-10 22:48:14 +0300 |
|---|---|---|
| committer | la-ninpre <aaoth@aaoth.xyz> | 2025-12-10 22:48:14 +0300 |
| commit | f3fcd25e29d1a8c4d355eb650091e2920def658e (patch) | |
| tree | 7f537b44a47359e5437e67251b3a5d4de1bb642c /journal | |
| download | html-journal-f3fcd25e29d1a8c4d355eb650091e2920def658e.tar.gz html-journal-f3fcd25e29d1a8c4d355eb650091e2920def658e.zip | |
init
Diffstat (limited to 'journal')
| -rw-r--r-- | journal/journal.go | 119 |
1 file changed, 119 insertions, 0 deletions
```diff
diff --git a/journal/journal.go b/journal/journal.go
new file mode 100644
index 0000000..e2e1bea
--- /dev/null
+++ b/journal/journal.go
@@ -0,0 +1,119 @@
+package journal
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"log"
+	"strings"
+	"time"
+
+	"git.sr.ht/~m15o/htmlj"
+	"golang.org/x/net/html"
+)
+
+const (
+	JournalTitleTag   = "h1"
+	EntryTitleTag     = "h2"
+	EntryContainerTag = "article"
+	DateFormat        = time.DateOnly
+)
+
+// findNode finds first instance of tag s in n and returns pointer to it
+func findNode(n *html.Node, s string) *html.Node {
+	var c *html.Node
+	if n == nil {
+		return nil
+	}
+	for c = n.FirstChild; c != nil; c = c.NextSibling {
+		if c.Type == html.ElementNode {
+			if c.Data == s {
+				break
+			}
+			if i := findNode(c, s); i != nil {
+				c = i
+				break
+			}
+		}
+	}
+	return c
+}
+
+// Parse is a stricter version of htmlj.Parse.
+//
+// This version extracts only the first journal from a given html.
+// It only accepts journals that are flat, like:
+//
+//	...
+//	<h1>journal</h1>
+//	...
+//	<article>
+//	  <h2>2025-12-10</h2>
+//	  <p>test</p>
+//	</article>
+//	...
+//
+// but won't accept other structures, while original will grab
+// entries from arbitrarily nested parts of html.
+func Parse(r io.Reader) (*htmlj.Journal, error) {
+	h, err := html.Parse(r)
+	if err != nil {
+		return nil, err
+	}
+	jt := findNode(h, JournalTitleTag)
+	if jt == nil {
+		return nil, fmt.Errorf("journal not found")
+	}
+
+	j := htmlj.Journal{
+		Title: strings.TrimSpace(jt.FirstChild.Data),
+	}
+	var b bytes.Buffer
+	for s := jt.NextSibling; s != nil; s = s.NextSibling {
+		if s.Type != html.ElementNode || s.Data != EntryContainerTag {
+			continue
+		}
+
+		et := findNode(s, EntryTitleTag)
+		if et == nil {
+			continue
+		}
+
+		title := strings.TrimSpace(et.FirstChild.Data)
+		if len(title) < 10 {
+			continue
+		}
+		t, err := time.Parse(DateFormat, title[0:10])
+		if err != nil {
+			continue
+		}
+
+		if j.Updated.Before(t) {
+			j.Updated = t
+		}
+		b.Reset()
+		s.RemoveChild(et)
+		for c := s.FirstChild; c != nil; c = c.NextSibling {
+			if err := html.Render(&b, c); err != nil {
+				// can this ever happen?
+				return nil, err
+			}
+		}
+		j.Entries = append(j.Entries, htmlj.Entry{
+			Title:     title,
+			Published: t,
+			Content:   strings.TrimSpace(b.String()),
+		})
+		if len(j.Entries) > 2 {
+			l := len(j.Entries) - 1
+			t1, t2, t3 := j.Entries[l-2].Published,
+				j.Entries[l-1].Published, j.Entries[l].Published
+			if (t1.Before(t2) && t2.After(t3)) ||
+				(t1.After(t2) && t2.Before(t3)) {
+				log.Println("warning: non-consecutive log entry:", title)
+			}
+		}
+	}
+
+	return &j, nil
+}
```
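For orientation, here is a minimal usage sketch of the new `Parse` function. The import path `example.org/html-journal/journal` is a placeholder (the commit does not show the module path), and the sketch only relies on names visible above: `Parse`, `DateFormat`, and the `htmlj.Journal`/`htmlj.Entry` fields that `Parse` populates.

```go
package main

import (
	"fmt"
	"log"
	"strings"

	// placeholder module path; substitute the repository's real module path
	"example.org/html-journal/journal"
)

func main() {
	// A flat journal in the shape Parse expects: an <h1> title followed by
	// sibling <article> elements whose <h2> starts with a YYYY-MM-DD date.
	const page = `
<h1>journal</h1>
<article>
  <h2>2025-12-10</h2>
  <p>test</p>
</article>`

	j, err := journal.Parse(strings.NewReader(page))
	if err != nil {
		log.Fatal(err)
	}

	fmt.Printf("%s (updated %s)\n", j.Title, j.Updated.Format(journal.DateFormat))
	for _, e := range j.Entries {
		fmt.Println(e.Published.Format(journal.DateFormat), e.Title)
		fmt.Println(e.Content)
	}
}
```

Note that `Parse` keeps each entry's remaining inner HTML (with the `<h2>` removed) in `Content`, so rendering or feed generation is left to the caller.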
