aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorla-ninpre <aaoth@aaoth.xyz>2025-12-10 22:48:14 +0300
committerla-ninpre <aaoth@aaoth.xyz>2025-12-10 22:48:14 +0300
commitf3fcd25e29d1a8c4d355eb650091e2920def658e (patch)
tree7f537b44a47359e5437e67251b3a5d4de1bb642c
downloadhtml-journal-f3fcd25e29d1a8c4d355eb650091e2920def658e.tar.gz
html-journal-f3fcd25e29d1a8c4d355eb650091e2920def658e.zip
init
-rw-r--r--.gitignore1
-rw-r--r--LICENCE15
-rw-r--r--README.md39
-rw-r--r--atom/LICENSE29
-rw-r--r--atom/atom.go96
-rw-r--r--go.mod7
-rw-r--r--go.sum4
-rw-r--r--journal/journal.go119
-rw-r--r--main.go37
9 files changed, 347 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0ae94b5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+html-journal \ No newline at end of file
diff --git a/LICENCE b/LICENCE
new file mode 100644
index 0000000..0706f33
--- /dev/null
+++ b/LICENCE
@@ -0,0 +1,15 @@
+ISC License
+
+Copyright (c) 2020-2022 la-ninpre
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
+REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
+AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
+INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
+LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THIS SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9d29a4e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,39 @@
+# html-journal
+
+this is a library for parsing the [html-journal](https://journal.miso.town) format.
+it also includes a little tool to generate an atom feed from a journal.
+
+originally made by m15o. this repo includes a more strict version of the format.
+
+for example, the original implementation can parse this:
+
+```
+<h1>test</h1>
+<p>some text</p>
+<div>
+ <section>
+ <h2>2025-12-10</h2>
+ <p>an entry</p>
+ </section>
+</div>
+```
+
+but this library will not. it expects an h1 heading,
+followed by article tags at the same level, each containing an h2 heading
+that starts with a `YYYY-MM-DD` date.
+
+## installation
+
+```
+$ go install git.sr.ht/~la_ninpre/html-journal@latest
+```
+
+## usage
+
+```
+$ html-journal file.html 'url' > atom.xml
+```
+
+## licence
+
+ISC. see `LICENCE` file in the repo.
diff --git a/atom/LICENSE b/atom/LICENSE
new file mode 100644
index 0000000..3aae2fb
--- /dev/null
+++ b/atom/LICENSE
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2022,
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file
diff --git a/atom/atom.go b/atom/atom.go
new file mode 100644
index 0000000..f0c49e8
--- /dev/null
+++ b/atom/atom.go
@@ -0,0 +1,96 @@
+// taken from https://git.sr.ht/~m15o/html-journal
+// Copyright 2022 m15o
+// Distributed under the terms of a BSD 3-Clause license, see LICENSE file.
+
+package atom
+
+import (
+ "encoding/xml"
+ "git.sr.ht/~m15o/htmlj"
+ "time"
+)
+
+// Feed is the root element of an Atom document. XMLName fixes the element
+// name to "feed" in the http://www.w3.org/2005/Atom namespace.
+//
+// Icon, Logo and Subtitle are tagged omitempty and therefore left out of the
+// marshaled output when they are empty strings. Entry holds the feed's
+// entries, each marshaled as an "entry" child element.
+type Feed struct {
+ XMLName xml.Name `xml:"http://www.w3.org/2005/Atom feed"`
+ Title string `xml:"title"`
+ ID string `xml:"id"`
+ Link []Link `xml:"link"`
+ Updated TimeStr `xml:"updated"`
+ Author *Person `xml:"author"`
+ Icon string `xml:"icon,omitempty"`
+ Logo string `xml:"logo,omitempty"`
+ Subtitle string `xml:"subtitle,omitempty"`
+ Entry []*Entry `xml:"entry"`
+}
+
+// Entry is a single item of a Feed, marshaled as an "entry" element.
+//
+// Author, Summary and Content are pointers; encoding/xml writes nothing for
+// a nil pointer, so unset ones do not appear in the output.
+// FeedFromJournal fills in Title, ID, Published, Updated and Content only.
+type Entry struct {
+ Title string `xml:"title"`
+ ID string `xml:"id"`
+ Link []Link `xml:"link"`
+ Published TimeStr `xml:"published"`
+ Updated TimeStr `xml:"updated"`
+ Author *Person `xml:"author"`
+ Summary *Text `xml:"summary"`
+ Content *Text `xml:"content"`
+}
+
+// Link is a "link" element; every field is marshaled as an XML attribute.
+// Href is always written, the remaining attributes are omitted when they
+// hold their zero value.
+type Link struct {
+ Rel string `xml:"rel,attr,omitempty"`
+ Href string `xml:"href,attr"`
+ Type string `xml:"type,attr,omitempty"`
+ HrefLang string `xml:"hreflang,attr,omitempty"`
+ Title string `xml:"title,attr,omitempty"`
+ Length uint `xml:"length,attr,omitempty"`
+}
+
+// Person describes a person, as used for the "author" element of a Feed or
+// an Entry. Name is always written; URI and Email are omitted when empty.
+//
+// InnerXML carries the ",innerxml" tag, which encoding/xml writes verbatim
+// (unescaped) inside the element when marshaling.
+type Person struct {
+ Name string `xml:"name"`
+ URI string `xml:"uri,omitempty"`
+ Email string `xml:"email,omitempty"`
+ InnerXML string `xml:",innerxml"`
+}
+
+// Text is a text construct such as an entry's "summary" or "content".
+// Type is written as the "type" attribute and Body as the element's
+// character data (escaped by encoding/xml when marshaled).
+type Text struct {
+ Type string `xml:"type,attr"`
+ Body string `xml:",chardata"`
+}
+
+// TimeStr is a timestamp that has already been rendered as the text placed
+// in the feed's date elements.
+type TimeStr string
+
+// Time renders t as a TimeStr using the layout "2006-01-02T15:04:05-07:00".
+// The zone is always written as a numeric offset, so a UTC time ends in
+// "+00:00" rather than "Z".
+func Time(t time.Time) TimeStr {
+	const layout = "2006-01-02T15:04:05-07:00"
+	stamp := t.Format(layout)
+	return TimeStr(stamp)
+}
+
+// FeedFromJournal converts journal j into an Atom feed identified by u.
+//
+// u is used as the feed ID, as the author URI and as the href of the feed's
+// single "alternate" link; the journal title doubles as the author name and
+// the feed's update time is j.Updated.
+//
+// Each journal entry becomes one feed entry, in the same order. Its ID is
+// u + "#" + title, its published and updated times are both the entry's
+// publication time, and its content is the entry body typed as "html".
+func FeedFromJournal(u string, j *htmlj.Journal) *Feed {
+	author := &Person{Name: j.Title, URI: u}
+	alternate := Link{Rel: "alternate", Href: u}
+
+	feed := &Feed{
+		Title:   j.Title,
+		ID:      u,
+		Link:    []Link{alternate},
+		Updated: Time(j.Updated),
+		Author:  author,
+	}
+
+	for _, e := range j.Entries {
+		stamp := Time(e.Published)
+		feed.Entry = append(feed.Entry, &Entry{
+			Title:     e.Title,
+			ID:        u + "#" + e.Title,
+			Published: stamp,
+			Updated:   stamp,
+			Content:   &Text{Type: "html", Body: e.Content},
+		})
+	}
+
+	return feed
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..8f69ffe
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,7 @@
+module git.sr.ht/~la_ninpre/html-journal
+
+go 1.18
+
+require git.sr.ht/~m15o/htmlj v0.0.0-20220709084050-c36dda5901f7
+
+require golang.org/x/net v0.0.0-20220607020251-c690dde0001d
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..d2e2f5a
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,4 @@
+git.sr.ht/~m15o/htmlj v0.0.0-20220709084050-c36dda5901f7 h1:IoyJG4XWeCGPt15B+3tW//HYHrqNCdkQERo5nrTVLbQ=
+git.sr.ht/~m15o/htmlj v0.0.0-20220709084050-c36dda5901f7/go.mod h1:Js769MMC92wojjogmjepyFRfYwCR018xUg2worv6RaE=
+golang.org/x/net v0.0.0-20220607020251-c690dde0001d h1:4SFsTMi4UahlKoloni7L4eYzhFRifURQLw+yv0QDCx8=
+golang.org/x/net v0.0.0-20220607020251-c690dde0001d/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
diff --git a/journal/journal.go b/journal/journal.go
new file mode 100644
index 0000000..e2e1bea
--- /dev/null
+++ b/journal/journal.go
@@ -0,0 +1,119 @@
+package journal
+
+import (
+ "bytes"
+ "fmt"
+ "io"
+ "log"
+ "strings"
+ "time"
+
+ "git.sr.ht/~m15o/htmlj"
+ "golang.org/x/net/html"
+)
+
+// Tag names and date layout that define the accepted journal structure.
+const (
+	// JournalTitleTag is the element whose text becomes the journal title.
+	JournalTitleTag = "h1"
+	// EntryTitleTag is the element inside an entry that holds its dated title.
+	EntryTitleTag = "h2"
+	// EntryContainerTag is the element that wraps a single entry.
+	EntryContainerTag = "article"
+	// DateFormat is the layout of the date that starts every entry title.
+	// It has the same value as time.DateOnly but is spelled out because that
+	// constant only exists since Go 1.20, while go.mod declares go 1.18.
+	DateFormat = "2006-01-02"
+)
+
+// findNode finds first instance of tag s in n and returns pointer to it
+func findNode(n *html.Node, s string) *html.Node {
+ var c *html.Node
+ if n == nil {
+ return nil
+ }
+ for c = n.FirstChild; c != nil; c = c.NextSibling {
+ if c.Type == html.ElementNode {
+ if c.Data == s {
+ break
+ }
+ if i := findNode(c, s); i != nil {
+ c = i
+ break
+ }
+ }
+ }
+ return c
+}
+
+// Parse is a stricter version of htmlj.Parse.
+//
+// This version extracts only the first journal from the given html.
+// The journal starts at the first h1 element; its entries are the article
+// elements that follow that h1 as siblings, like:
+//
+//	...
+//	<h1>journal</h1>
+//	...
+//	<article>
+//		<h2>2025-12-10</h2>
+//		<p>test</p>
+//	</article>
+//	...
+//
+// Other structures are not accepted, whereas the original grabs entries
+// from arbitrarily nested parts of the html.
+//
+// An article is skipped when it contains no h2 or when the h2 text does not
+// start with a date in DateFormat. The h2 is removed from the article and
+// the remaining children are rendered as the entry content. The journal's
+// Updated field is set to the latest entry date. A warning is logged when
+// the dates of three successive entries change direction.
+//
+// Parse returns an error if the html cannot be parsed, if no h1 is present,
+// or if rendering an entry's content fails.
+func Parse(r io.Reader) (*htmlj.Journal, error) {
+	doc, err := html.Parse(r)
+	if err != nil {
+		return nil, err
+	}
+	jt := findNode(doc, JournalTitleTag)
+	if jt == nil {
+		return nil, fmt.Errorf("journal not found")
+	}
+
+	// DateFormat is a fixed-width layout, so a formatted date is exactly as
+	// long as the layout itself.
+	const dateLen = len(DateFormat)
+
+	j := htmlj.Journal{Title: headingText(jt)}
+	var b bytes.Buffer
+	for s := jt.NextSibling; s != nil; s = s.NextSibling {
+		if s.Type != html.ElementNode || s.Data != EntryContainerTag {
+			continue
+		}
+
+		et := findNode(s, EntryTitleTag)
+		if et == nil {
+			continue
+		}
+
+		// An empty h2 yields an empty title, which is rejected here along
+		// with every other title too short to hold a date.
+		title := headingText(et)
+		if len(title) < dateLen {
+			continue
+		}
+		t, err := time.Parse(DateFormat, title[:dateLen])
+		if err != nil {
+			continue
+		}
+
+		if j.Updated.Before(t) {
+			j.Updated = t
+		}
+
+		// findNode searches recursively, so the h2 may sit deeper than a
+		// direct child of the article. Node.RemoveChild panics unless it is
+		// called on the node's actual parent, hence et.Parent instead of s.
+		et.Parent.RemoveChild(et)
+
+		b.Reset()
+		for c := s.FirstChild; c != nil; c = c.NextSibling {
+			if err := html.Render(&b, c); err != nil {
+				// can this ever happen?
+				return nil, err
+			}
+		}
+		j.Entries = append(j.Entries, htmlj.Entry{
+			Title:     title,
+			Published: t,
+			Content:   strings.TrimSpace(b.String()),
+		})
+
+		// Compare the last three entries: a local peak or dip in their
+		// dates means the newest entry breaks the ordering.
+		if len(j.Entries) > 2 {
+			l := len(j.Entries) - 1
+			t1, t2, t3 := j.Entries[l-2].Published,
+				j.Entries[l-1].Published, j.Entries[l].Published
+			if (t1.Before(t2) && t2.After(t3)) ||
+				(t1.After(t2) && t2.Before(t3)) {
+				log.Println("warning: non-consecutive log entry:", title)
+			}
+		}
+	}
+
+	return &j, nil
+}
+
+// headingText returns the trimmed text of heading element n.
+//
+// When the heading starts with a text node, only that node is used. When it
+// starts with another node (for example <h1><a>title</a></h1>), the text of
+// all descendant text nodes is concatenated instead, because the Data of an
+// element node is its tag name, not its text. An empty heading yields "".
+func headingText(n *html.Node) string {
+	first := n.FirstChild
+	if first == nil {
+		return ""
+	}
+	if first.Type == html.TextNode {
+		return strings.TrimSpace(first.Data)
+	}
+
+	var sb strings.Builder
+	var walk func(*html.Node)
+	walk = func(c *html.Node) {
+		if c.Type == html.TextNode {
+			sb.WriteString(c.Data)
+		}
+		for k := c.FirstChild; k != nil; k = k.NextSibling {
+			walk(k)
+		}
+	}
+	walk(n)
+	return strings.TrimSpace(sb.String())
+}
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..9f2f622
--- /dev/null
+++ b/main.go
@@ -0,0 +1,37 @@
+// read html journal from file and output an atom feed
+package main
+
+import (
+ "encoding/xml"
+ "log"
+ "net/url"
+ "os"
+
+ "git.sr.ht/~la_ninpre/html-journal/atom"
+ "git.sr.ht/~la_ninpre/html-journal/journal"
+)
+
+// main reads the html journal named by the first argument, converts it to an
+// Atom feed whose ID and links use the url given as second argument, and
+// writes the feed, preceded by the XML header, to standard output.
+// Any failure, including a failed write to standard output, is logged and
+// ends the program with a non-zero exit status.
+func main() {
+	if len(os.Args) < 3 {
+		log.Fatalf("usage: %s file url", os.Args[0])
+	}
+	path, feedURL := os.Args[1], os.Args[2]
+
+	// A url that does not parse is only reported; the feed is still built.
+	if _, err := url.Parse(feedURL); err != nil {
+		log.Printf("malformed url: %v", err)
+	}
+
+	f, err := os.Open(path)
+	if err != nil {
+		log.Fatal(err)
+	}
+	defer f.Close()
+
+	j, err := journal.Parse(f)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	fx := atom.FeedFromJournal(feedURL, j)
+	dat, err := xml.MarshalIndent(fx, "", " ")
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	// Output is normally redirected to a file or pipe; report a failed
+	// write instead of silently leaving a truncated feed behind.
+	if _, err := os.Stdout.WriteString(xml.Header); err != nil {
+		log.Fatal(err)
+	}
+	if _, err := os.Stdout.Write(dat); err != nil {
+		log.Fatal(err)
+	}
+}