From f3fcd25e29d1a8c4d355eb650091e2920def658e Mon Sep 17 00:00:00 2001 From: la-ninpre Date: Wed, 10 Dec 2025 22:48:14 +0300 Subject: init --- .gitignore | 1 + LICENCE | 15 +++++++ README.md | 39 ++++++++++++++++++ atom/LICENSE | 29 +++++++++++++ atom/atom.go | 96 ++++++++++++++++++++++++++++++++++++++++++ go.mod | 7 ++++ go.sum | 4 ++ journal/journal.go | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++++ main.go | 37 +++++++++++++++++ 9 files changed, 347 insertions(+) create mode 100644 .gitignore create mode 100644 LICENCE create mode 100644 README.md create mode 100644 atom/LICENSE create mode 100644 atom/atom.go create mode 100644 go.mod create mode 100644 go.sum create mode 100644 journal/journal.go create mode 100644 main.go diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0ae94b5 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +html-journal \ No newline at end of file diff --git a/LICENCE b/LICENCE new file mode 100644 index 0000000..0706f33 --- /dev/null +++ b/LICENCE @@ -0,0 +1,15 @@ +ISC License + +Copyright (c) 2020-2022 la-ninpre + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH +REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY +AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, +INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM +LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR +OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THIS SOFTWARE. 
diff --git a/README.md b/README.md new file mode 100644 index 0000000..9d29a4e --- /dev/null +++ b/README.md @@ -0,0 +1,39 @@ +# html-journal + +this is a library for parsing [html-journal](https://journal.miso.town) format. +it also includes a little tool to generate feed from a journal. + +originally made by m15o. this repo includes a more strict version of the format. + +for example, original implementation could parse this: + +``` +

<h1>test</h1>
+<p>some text</p>
+<div>
+<article>
+<h2>2025-12-10</h2>
+<p>an entry</p>
+</article>
+</div>
+``` + +but this library will not. it expects a h1 heading, +then some article tags at the same level that contain h2 heading, +which starts with a `YYYY-MM-DD` date. + +## installation + +``` +$ go install git.sr.ht/~la_ninpre/html-journal@latest +``` + +## usage + +``` +$ html-journal file.html 'url' > atom.xml +``` + +## licence + +ISC. see `LICENCE` file in the repo. diff --git a/atom/LICENSE b/atom/LICENSE new file mode 100644 index 0000000..3aae2fb --- /dev/null +++ b/atom/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2022, +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/atom/atom.go b/atom/atom.go
new file mode 100644
index 0000000..f0c49e8
--- /dev/null
+++ b/atom/atom.go
@@ -0,0 +1,107 @@
+// taken from https://git.sr.ht/~m15o/html-journal
+// Copyright 2022 m15o
+// Distributed under the terms of a BSD 3-Clause license, see LICENSE file.
+
+// Package atom builds Atom (RFC 4287) feeds from parsed html journals.
+package atom
+
+import (
+	"encoding/xml"
+	"git.sr.ht/~m15o/htmlj"
+	"time"
+)
+
+// Feed is the root <feed> element of an Atom document.
+type Feed struct {
+	XMLName  xml.Name `xml:"http://www.w3.org/2005/Atom feed"`
+	Title    string   `xml:"title"`
+	ID       string   `xml:"id"`
+	Link     []Link   `xml:"link"`
+	Updated  TimeStr  `xml:"updated"`
+	Author   *Person  `xml:"author"`
+	Icon     string   `xml:"icon,omitempty"`
+	Logo     string   `xml:"logo,omitempty"`
+	Subtitle string   `xml:"subtitle,omitempty"`
+	Entry    []*Entry `xml:"entry"`
+}
+
+// Entry is a single <entry> element of a feed.
+type Entry struct {
+	Title     string  `xml:"title"`
+	ID        string  `xml:"id"`
+	Link      []Link  `xml:"link"`
+	Published TimeStr `xml:"published"`
+	Updated   TimeStr `xml:"updated"`
+	Author    *Person `xml:"author"`
+	Summary   *Text   `xml:"summary"`
+	Content   *Text   `xml:"content"`
+}
+
+// Link is a <link> element; every field is serialized as an attribute.
+type Link struct {
+	Rel      string `xml:"rel,attr,omitempty"`
+	Href     string `xml:"href,attr"`
+	Type     string `xml:"type,attr,omitempty"`
+	HrefLang string `xml:"hreflang,attr,omitempty"`
+	Title    string `xml:"title,attr,omitempty"`
+	Length   uint   `xml:"length,attr,omitempty"`
+}
+
+// Person describes a feed or entry author.
+type Person struct {
+	Name     string `xml:"name"`
+	URI      string `xml:"uri,omitempty"`
+	Email    string `xml:"email,omitempty"`
+	InnerXML string `xml:",innerxml"`
+}
+
+// Text is a text construct such as <summary> or <content>; Type is written
+// as the type attribute and Body as character data.
+type Text struct {
+	Type string `xml:"type,attr"`
+	Body string `xml:",chardata"`
+}
+
+// TimeStr is a timestamp already formatted for use in a feed.
+type TimeStr string
+
+// Time formats t as a date-time with a numeric UTC offset.
+func Time(t time.Time) TimeStr {
+	return TimeStr(t.Format("2006-01-02T15:04:05-07:00"))
+}
+
+// FeedFromJournal converts journal j into a feed. The url u is used as the
+// feed id, the author uri and the alternate link, and as the prefix of every
+// entry id. The journal title doubles as the author name.
+func FeedFromJournal(u string, j *htmlj.Journal) *Feed {
+	f := &Feed{
+		Title: j.Title,
+		ID:    u,
+		Author: &Person{
+			Name: j.Title,
+			URI:  u,
+		},
+		Updated: Time(j.Updated),
+		Link: []Link{
+			{
+				Rel:  "alternate",
+				Href: u,
+			},
+		},
+	}
+
+	for _, p := range j.Entries {
+		f.Entry = append(f.Entry, &Entry{
+			Title:     p.Title,
+			ID:        u + "#" + p.Title,
+			Published: Time(p.Published),
+			Updated:   Time(p.Published),
+			Content: &Text{
+				Type: "html",
+				Body: p.Content,
+			},
+		})
+	}
+
+	return f
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..8f69ffe
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,7 @@
+module git.sr.ht/~la_ninpre/html-journal
+
+go 1.18
+
+require git.sr.ht/~m15o/htmlj v0.0.0-20220709084050-c36dda5901f7
+
+require golang.org/x/net v0.0.0-20220607020251-c690dde0001d
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..d2e2f5a
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,4 @@
+git.sr.ht/~m15o/htmlj v0.0.0-20220709084050-c36dda5901f7 h1:IoyJG4XWeCGPt15B+3tW//HYHrqNCdkQERo5nrTVLbQ=
+git.sr.ht/~m15o/htmlj v0.0.0-20220709084050-c36dda5901f7/go.mod h1:Js769MMC92wojjogmjepyFRfYwCR018xUg2worv6RaE=
+golang.org/x/net v0.0.0-20220607020251-c690dde0001d h1:4SFsTMi4UahlKoloni7L4eYzhFRifURQLw+yv0QDCx8=
+golang.org/x/net v0.0.0-20220607020251-c690dde0001d/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
diff --git a/journal/journal.go b/journal/journal.go
new file mode 100644
index 0000000..e2e1bea
--- /dev/null
+++ b/journal/journal.go
@@ -0,0 +1,142 @@
+// Package journal implements a strict parser for the html-journal format.
+package journal
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"log"
+	"strings"
+	"time"
+
+	"git.sr.ht/~m15o/htmlj"
+	"golang.org/x/net/html"
+)
+
+// Names of the html tags that make up a journal, and the layout of the date
+// that starts every entry title. DateFormat is spelled out because the
+// equivalent time.DateOnly constant needs go 1.20, while go.mod says 1.18.
+const (
+	JournalTitleTag   = "h1"
+	EntryTitleTag     = "h2"
+	EntryContainerTag = "article"
+	DateFormat        = "2006-01-02"
+)
+
+// findNode returns the first element named s among the descendants of n,
+// searching depth-first in document order, or nil if there is none.
+func findNode(n *html.Node, s string) *html.Node {
+	if n == nil {
+		return nil
+	}
+	for c := n.FirstChild; c != nil; c = c.NextSibling {
+		if c.Type != html.ElementNode {
+			continue
+		}
+		if c.Data == s {
+			return c
+		}
+		if d := findNode(c, s); d != nil {
+			return d
+		}
+	}
+	return nil
+}
+
+// headingText returns the trimmed leading text of heading n. It returns ""
+// when n has no children or when its first child is not a text node.
+func headingText(n *html.Node) string {
+	if n.FirstChild == nil || n.FirstChild.Type != html.TextNode {
+		return ""
+	}
+	return strings.TrimSpace(n.FirstChild.Data)
+}
+
+// Parse is a stricter version of htmlj.Parse.
+//
+// This version extracts only the first journal from a given html.
+// It only accepts journals that are flat, like:
+//
+//	...
+//	<h1>journal</h1>
+//	...
+//	<article>
+//	<h2>2025-12-10</h2>
+//	<p>test</p>
+//	</article>
+//	...
+//
+// but won't accept other structures, while original will grab
+// entries from arbitrarily nested parts of html.
+//
+// Only article elements that are siblings following the h1 are considered.
+// An article is skipped when it has no h2 or when the text of its h2 does
+// not start with a YYYY-MM-DD date. The h2 is removed from the article
+// before the rest of it is rendered as the entry content.
+//
+// Parse returns an error when html.Parse fails, when there is no h1, or
+// when rendering an entry fails.
+func Parse(r io.Reader) (*htmlj.Journal, error) {
+	h, err := html.Parse(r)
+	if err != nil {
+		return nil, err
+	}
+	jt := findNode(h, JournalTitleTag)
+	if jt == nil {
+		return nil, fmt.Errorf("journal not found")
+	}
+
+	// headingText guards against an empty heading, where FirstChild is nil
+	j := htmlj.Journal{Title: headingText(jt)}
+	var b bytes.Buffer
+	for s := jt.NextSibling; s != nil; s = s.NextSibling {
+		if s.Type != html.ElementNode || s.Data != EntryContainerTag {
+			continue
+		}
+
+		et := findNode(s, EntryTitleTag)
+		if et == nil {
+			continue
+		}
+
+		title := headingText(et)
+		if len(title) < 10 {
+			continue
+		}
+		t, err := time.Parse(DateFormat, title[:10])
+		if err != nil {
+			continue
+		}
+
+		if j.Updated.Before(t) {
+			j.Updated = t
+		}
+		b.Reset()
+		// findNode searches recursively, so et may sit deeper than s, and
+		// RemoveChild panics unless it is called on the actual parent
+		et.Parent.RemoveChild(et)
+		for c := s.FirstChild; c != nil; c = c.NextSibling {
+			if err := html.Render(&b, c); err != nil {
+				// can this ever happen?
+				return nil, err
+			}
+		}
+		j.Entries = append(j.Entries, htmlj.Entry{
+			Title:     title,
+			Published: t,
+			Content:   strings.TrimSpace(b.String()),
+		})
+		// warn when the dates of the last three entries change direction
+		if len(j.Entries) > 2 {
+			l := len(j.Entries) - 1
+			t1, t2, t3 := j.Entries[l-2].Published,
+				j.Entries[l-1].Published, j.Entries[l].Published
+			if (t1.Before(t2) && t2.After(t3)) ||
+				(t1.After(t2) && t2.Before(t3)) {
+				log.Println("warning: non-consecutive log entry:", title)
+			}
+		}
+	}
+
+	return &j, nil
+}
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..9f2f622
--- /dev/null
+++ b/main.go
@@ -0,0 +1,42 @@
+// read html journal from file and output an atom feed
+package main
+
+import (
+	"encoding/xml"
+	"log"
+	"net/url"
+	"os"
+
+	"git.sr.ht/~la_ninpre/html-journal/atom"
+	"git.sr.ht/~la_ninpre/html-journal/journal"
+)
+
+func main() {
+	if len(os.Args) < 3 {
+		log.Fatalf("usage: %s file url", os.Args[0])
+	}
+	// a url that fails to parse is only reported; the feed is still built
+	if _, err := url.Parse(os.Args[2]); err != nil {
+		log.Printf("malformed url: %v", err)
+	}
+	f, err := os.Open(os.Args[1])
+	if err != nil {
+		log.Fatal(err)
+	}
+	defer f.Close()
+	j, err := journal.Parse(f)
+	if err != nil {
+		log.Fatal(err)
+	}
+	fx := atom.FeedFromJournal(os.Args[2], j)
+	dat, err := xml.MarshalIndent(fx, "", " ")
+	if err != nil {
+		log.Fatal(err)
+	}
+	if _, err := os.Stdout.WriteString(xml.Header); err != nil {
+		log.Fatal(err)
+	}
+	if _, err := os.Stdout.Write(dat); err != nil {
+		log.Fatal(err)
+	}
+}
-- 
cgit v1.2.3