Much improved html2text (fromhtml)
author Mikael Berthe <mikael@lilotux.net>
Fri, 12 May 2017 23:31:21 +0200
changeset 126 7d712d2bde73
parent 125 d436b88d137b
child 127 2b4d0f198a94
Much improved html2text (fromhtml) Handles tags and mentions specifically; display the URL in the other cases.
printer/html2text/LICENSE
printer/html2text/README.md
printer/html2text/html2text.go
printer/html2text/html2text_test.go
printer/plainprinter.go
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/printer/html2text/LICENSE	Fri May 12 23:31:21 2017 +0200
@@ -0,0 +1,23 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Shawn Goertzen
+Copyright (c) 2017 Mikael Berthe
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/printer/html2text/README.md	Fri May 12 23:31:21 2017 +0200
@@ -0,0 +1,10 @@
+# html2text
+
+This is a copy of github.com/sgoertzen/html2text, heavily customized for
+Mastodon's HTML messages.
+
+html2text is an HTML to text converter written in Go.
+This library strips the HTML tags from the source and cleans up the text.
+This includes things like adding new lines correctly and appending the URLs from links.
+
+For Mastodon tags, URLs are not displayed.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/printer/html2text/html2text.go	Fri May 12 23:31:21 2017 +0200
@@ -0,0 +1,148 @@
+// Copyright (c) 2015 Shawn Goertzen
+// Copyright (c) 2017 Mikael Berthe
+//
+// This code mostly comes from github.com/sgoertzen/html2text,
+// with some specific but intrusive changes for Mastodon HTML messages.
+// For example, links are not displayed for hashtags and mentions,
+// and links alone are displayed for the other cases.
+//
+// Licensed under the MIT license.
+// Please see the LICENSE file in this directory.
+
+package html2text
+
+import (
+	"bytes"
+	"errors"
+	"golang.org/x/net/html"
+	"strings"
+)
+
+var breakers = map[string]bool{ // elements after which process ends the current output line
+	"br":  true,
+	"div": true,
+	"tr":  true,
+	"li":  true,
+	"p":   true,
+}
+
+// Textify parses body as HTML and returns its plain-text rendering with
+// leading and trailing whitespace removed. A parse failure is reported as a
+// generic error and an empty string.
+func Textify(body string) (string, error) {
+	doc, err := html.Parse(strings.NewReader(body))
+	if err != nil {
+		return "", errors.New("unable to parse the html")
+	}
+
+	// Walk the whole document; the root has no enclosing span class.
+	var out bytes.Buffer
+	process(doc, &out, "")
+	return strings.TrimSpace(out.String()), nil
+}
+
+// process appends the text rendering of node n and its subtree to b.
+//
+// class is the class attribute of the parent <span> ("" otherwise); it is
+// only forwarded to anchor for <a> elements. <head> subtrees are skipped,
+// <a> elements that have children are delegated to anchor, and text nodes
+// are written with a heuristic that inserts or collapses a separating space.
+// After an element listed in breakers, trailing spaces are removed and a
+// newline is appended, unless b is empty or already ends with a newline.
+func process(n *html.Node, b *bytes.Buffer, class string) {
+	isElement := n.Type == html.ElementNode
+	descend := true
+
+	switch {
+	case isElement && n.Data == "head":
+		return
+	case isElement && n.Data == "a" && n.FirstChild != nil:
+		// anchor walks the link's children itself.
+		anchor(n, b, class)
+		descend = false
+	case n.Type == html.TextNode:
+		// Drop surrounding blanks/tabs, then turn non-breaking spaces into
+		// plain ones.
+		text := strings.Replace(strings.Trim(n.Data, " \t"), "\u00a0", " ", -1)
+
+		size := b.Len()
+		var prev byte // last byte written so far; stays 0 while b is empty
+		if size > 0 {
+			prev = b.Bytes()[size-1]
+		}
+		afterSpace := prev == ' '
+		beforeSpace := len(text) > 0 && text[0] == ' '
+
+		switch {
+		case afterSpace && beforeSpace:
+			b.WriteString(text[1:]) // keep only one of the two spaces
+		case afterSpace || beforeSpace:
+			b.WriteString(text)
+		default:
+			// No space on either side: add one, except at the very start,
+			// after a newline, or right after a '@' or '#'.
+			if size > 0 && prev != '\n' && prev != '@' && prev != '#' {
+				b.WriteString(" ")
+			}
+			b.WriteString(text)
+		}
+	}
+
+	if descend {
+		// Only a <span> hands its class down, and only to its direct children.
+		childClass := ""
+		if isElement && n.Data == "span" {
+			for _, a := range n.Attr {
+				if a.Key == "class" {
+					childClass = a.Val
+					break
+				}
+			}
+		}
+		for child := n.FirstChild; child != nil; child = child.NextSibling {
+			process(child, b, childClass)
+		}
+	}
+
+	// Line break handling for block-like elements.
+	size := b.Len()
+	if size == 0 || !isElement || !breakers[n.Data] {
+		return
+	}
+	if b.Bytes()[size-1] == '\n' {
+		return
+	}
+	// Strip the trailing spaces before ending the line.
+	for size > 0 && b.Bytes()[size-1] == ' ' {
+		size--
+	}
+	b.Truncate(size)
+	b.WriteString("\n")
+}
+
+// anchor writes the text form of the <a> element n to b.
+//
+// class is the class attribute of the enclosing <span> as passed down by
+// process ("" when there is none). Links inside a "tag" or "h-card" span,
+// links directly preceded by '@', and links whose text starts with '#' or
+// '@' are rendered by their text. Every other link is rendered as the value
+// of its href attribute; if it has none, only the separating space is
+// written.
+func anchor(n *html.Node, b *bytes.Buffer, class string) {
+	bl := b.Len()
+	var last byte // last byte already in b; stays 0 while b is empty
+	if bl > 0 {
+		last = b.Bytes()[bl-1]
+	}
+
+	// Render the link text into a scratch buffer so it can be inspected
+	// before deciding what to output.
+	var tmpbuf bytes.Buffer
+	for c := n.FirstChild; c != nil; c = c.NextSibling {
+		process(c, &tmpbuf, class)
+	}
+	s := tmpbuf.String()
+	sigil := strings.HasPrefix(s, "#") || strings.HasPrefix(s, "@")
+
+	if class == "tag" || class == "h-card" || last == '@' {
+		// process trims the text preceding the link, so when the '#'/'@' is
+		// part of the link text itself the separating space has to be added
+		// here; otherwise "hello <a>@user</a>" would come out as "hello@user".
+		if sigil && bl > 0 && last != ' ' && last != '\n' && last != '@' && last != '#' {
+			b.WriteString(" ")
+		}
+		b.WriteString(s)
+		return
+	}
+
+	// Add heading space if needed
+	if last != ' ' && last != '\n' {
+		b.WriteString(" ")
+	}
+
+	if sigil {
+		b.WriteString(s) // Tag or mention: display content
+		return
+	}
+
+	// Display href link
+	for _, attr := range n.Attr {
+		if attr.Key == "href" {
+			// Use the matched attribute: href is not guaranteed to be the
+			// first attribute of the element.
+			b.WriteString(attr.Val)
+			break
+		}
+	}
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/printer/html2text/html2text_test.go	Fri May 12 23:31:21 2017 +0200
@@ -0,0 +1,177 @@
+package html2text
+
+import (
+	"github.com/stretchr/testify/assert"
+	"testing"
+)
+
+// TestTextify checks that inline markup is dropped and <br/> yields a newline.
+func TestTextify(t *testing.T) {
+	got, err := Textify("<html><body><b>body</b><br/>body2</body></html>")
+	assert.Nil(t, err)
+	assert.Equal(t, "body\nbody2", got)
+}
+
+// TestTextifyDiv checks that text following a <div> starts on its own line.
+func TestTextifyDiv(t *testing.T) {
+	got, err := Textify("<div>first</div>second")
+	assert.Nil(t, err)
+	assert.Equal(t, "first\nsecond", got)
+}
+
+/*
+func TestTextifyLink(t *testing.T) {
+	expected := "somelink (link: someurl)"
+	r, e := Textify("<a href=\"someurl\">somelink</a>")
+	assert.Nil(t, e)
+	assert.Equal(t, expected, r)
+}
+*/
+
+// TestTextifyDontDuplicateLink checks that a link whose text equals its href is printed once.
+func TestTextifyDontDuplicateLink(t *testing.T) {
+	got, err := Textify("<a href=\"www.awesome.com\">www.awesome.com</a>")
+	assert.Nil(t, err)
+	assert.Equal(t, "www.awesome.com", got)
+}
+
+// TestTextifySpaces checks that spaces around the text of a <div> are removed.
+func TestTextifySpaces(t *testing.T) {
+	got, err := Textify("<div> hello  </div>")
+	assert.Nil(t, err)
+	assert.Equal(t, "hello", got)
+}
+
+/*  I don't think we want that for Mastodon...
+func TestTextifySpacesMultiple(t *testing.T) {
+	expected := "hello goodbye"
+	r, e := Textify("<span> hello  </span><span>   goodbye   </span>")
+	assert.Nil(t, e)
+	assert.Equal(t, expected, r)
+}
+*/
+
+// TestTextifyNonBreakingSpace checks that &nbsp; is rendered as a regular space.
+func TestTextifyNonBreakingSpace(t *testing.T) {
+	got, err := Textify("a &nbsp; a")
+	assert.Equal(t, "a   a", got)
+	assert.Nil(t, err)
+}
+
+// TestTextifyLimitedNewLines checks that consecutive <br/> tags produce a single newline.
+func TestTextifyLimitedNewLines(t *testing.T) {
+	got, err := Textify("abc <br/> <br/> <br/> <br/>xyz")
+	assert.Nil(t, err)
+	assert.Equal(t, "abc\nxyz", got)
+}
+
+// TestTextifyTable checks a table sample: one output line per row, links shown as their URL.
+func TestTextifyTable(t *testing.T) {
+	want := `Join by phone
+1-877-668-4490 Call-in toll-free number (US/Canada)
+1-408-792-6300 Call-in toll number (US/Canada)
+Access code: 111 111 111
+https://akqa.webex.com/akqa/globalcallin.php?serviceType=MC&ED=299778282&tollFree=1 | http://www.webex.com/pdf/tollfree_restrictions.pdf`
+
+	input := `<table width="747" style="width:448.2pt;"> <col width="747" style="width:448.2pt;"> <tbody> <tr> <td><font face="Arial" color="#666666"><b>Join by phone</b></font></td> </tr> <tr> <td><font face="Arial" size="3" color="#666666"><span style="font-size:11.5pt;"><b>1-877-668-4490</b> Call-in toll-free number (US/Canada)</span></font></td> </tr> <tr> <td><font face="Arial" size="3" color="#666666"><span style="font-size:11.5pt;"><b>1-408-792-6300</b> Call-in toll number (US/Canada)</span></font></td> </tr> <tr> <td><font face="Arial" size="3" color="#666666"><span style="font-size:11.5pt;">Access code: 111 111 111</span></font></td> </tr> <tr> <td><a href="https://akqa.webex.com/akqa/globalcallin.php?serviceType=MC&amp;ED=299778282&amp;tollFree=1"><font face="Arial" size="2" color="#00AFF9"><span style="font-size:10pt;"><u>Global call-in numbers</u></span></font></a><font face="Arial" size="3" color="#666666"><span style="font-size:11.5pt;"> &nbsp;|&nbsp; </span></font><a href="http://www.webex.com/pdf/tollfree_restrictions.pdf"><font face="Arial" size="2" color="#00AFF9"><span style="font-size:10pt;"><u>Toll-free calling restrictions</u></span></font></a></td> </tr> </tbody> </table>`
+	got, err := Textify(input)
+	assert.Nil(t, err)
+	assert.Equal(t, want, got)
+}
+
+// TestTextifyComment checks that HTML comments are left out of the output.
+func TestTextifyComment(t *testing.T) {
+	got, err := Textify("<!-- this should not appear -->this should appear")
+	assert.Nil(t, err)
+	assert.Equal(t, "this should appear", got)
+}
+
+func TestTextifyCommentInHead(t *testing.T) {
+	expected := "qwerty"
+
+	body := `<html> <head> <meta http-equiv="Content-Type" content="text/html; charset=utf-8"> <meta name="Generator" content="Microsoft Exchange Server"> <!-- converted from rtf --><style><!-- .EmailQuote { margin-left: 1pt; padding-left: 4pt; border-left: #800000 2px solid; } --></style> </head> <body>qwerty</body> </html>`
+
+	r, e := Textify(body)
+	assert.Nil(t, e)
+	assert.Equal(t, expected, r)
+}
+
+func TestTextifyLists(t *testing.T) {
+	expected := "a\nb\n1\n2"
+
+	body := `<ol><li>a</li><li>b</li></ol><ul><li>1</li><li>2</li></ul>`
+
+	r, e := Textify(body)
+	assert.Nil(t, e)
+	assert.Equal(t, expected, r)
+}
+
+func TestTextifyMastodonSample1(t *testing.T) {
+	expected := "@magi hello \\U0001F607 @TEST"
+
+	body := `<p><span class=\"h-card\"><a href=\"https://example.com/@magi\">@<span>magi</span></a></span> hello \U0001F607 <span class=\"h-card\"><a href=\"https://example.com/@TEST\">@<span>TEST</span></a></span></p>`
+
+	r, e := Textify(body)
+	assert.Nil(t, e)
+	assert.Equal(t, expected, r)
+}
+
+func TestTextifyMastodonSample2(t *testing.T) {
+	expected := "@cadey It looks good at first glance\n\"case <-stop\"  Actually you don't listen to stop channel, you close it if you want to stop the listener."
+
+	body := `<p><span class="h-card"><a href="https://www.example.com/@cadey" class="u-url mention">@<span>cadey</span></a></span> It looks good at first glance</p><p>&quot;case &lt;-stop&quot;  Actually you don&apos;t listen to stop channel, you close it if you want to stop the listener.</p>`
+
+	r, e := Textify(body)
+	assert.Nil(t, e)
+	assert.Equal(t, expected, r)
+}
+
+func TestTextifyMastodonSample3(t *testing.T) {
+	expected := "From timeline: Materials research creates potential for improved computer chips and transistors #phys #physics ..."
+
+	body := `From timeline: Materials research creates potential for improved computer chips and transistors #<span class="tag"><a href="https://social.oalm.gub.uy/tag/phys">phys</a></span> #<span class="tag"><a href="https://social.oalm.gub.uy/tag/physics">physics</a></span><p>...</p>`
+
+	r, e := Textify(body)
+	assert.Nil(t, e)
+	assert.Equal(t, expected, r)
+}
+
+func TestTextifyMastodonSample4(t *testing.T) {
+	expected := "Vous reprendrez bien un peu de #Tolkein ?\n#Arte +7 propose un ensemble de 6 vidéos en plus du documentaire:\nhttp://www.arte.tv/fr/videos/RC-014610/tolkien/"
+
+	body := `<p>Vous reprendrez bien un peu de <a href="https://framapiaf.org/tags/tolkein">#<span>Tolkein</span></a> ?<br><a href="https://framapiaf.org/tags/arte">#<span>Arte</span></a>+7 propose un ensemble de 6 vidéos en plus du documentaire:</p><p><a href="http://www.arte.tv/fr/videos/RC-014610/tolkien/"><span class="invisible">http://www.</span><span class="ellipsis">arte.tv/fr/videos/RC-014610/to</span><span class="invisible">lkien/</span></a></p>`
+
+	r, e := Textify(body)
+	assert.Nil(t, e)
+	assert.Equal(t, expected, r)
+}
+
+func TestTextifyMastodonMentionAndTag(t *testing.T) {
+	expected := "@ACh Mais heu ! Moi aussi je fais du #TootRadio de gens morts il y a 5 siècles. Gesulado, Charpentier, Mireille Mathieu..."
+
+	body := `<p><span class="h-card"><a href="https://mamot.fr/@ACh">@<span>ACh</span></a></span> Mais heu ! Moi aussi je fais du <a href="https://example.com/tags/tootradio">#<span>TootRadio</span></a> de gens morts il y a 5 siècles. Gesulado, Charpentier, Mireille Mathieu...</p>`
+
+	r, e := Textify(body)
+	assert.Nil(t, e)
+	assert.Equal(t, expected, r)
+}
+
+func TestTextifyMastodonLinkSpacing(t *testing.T) {
+	expected := "\"Twitter\" https://twitter.com/holly/status/123456789012345678"
+
+	body := `<p>"Twitter" <a href="https://twitter.com/holly/status/123456789012345678"><span class="invisible">https://</span><span class="ellipsis">twitter.com/holly/status/86266</span><span class="invisible">1234567890123</span></a></p>`
+
+	r, e := Textify(body)
+	assert.Nil(t, e)
+	assert.Equal(t, expected, r)
+}
+
+func TestTextifyMastodonMentionGNUSocial(t *testing.T) {
+	expected := "@username Hello."
+
+	body := `@<a href="https://example.com/user/12345">username</a> Hello.`
+
+	r, e := Textify(body)
+	assert.Nil(t, e)
+	assert.Equal(t, expected, r)
+}
--- a/printer/plainprinter.go	Fri May 12 00:09:04 2017 +0200
+++ b/printer/plainprinter.go	Fri May 12 23:31:21 2017 +0200
@@ -12,10 +12,10 @@
 	"reflect"
 	"time"
 
-	"github.com/jaytaylor/html2text"
 	"github.com/m0t0k1ch1/gomif"
 
 	"github.com/McKael/madon"
+	"github.com/McKael/madonctl/printer/html2text"
 )
 
 // PlainPrinter is the default "plain text" printer
@@ -120,7 +120,7 @@
 }
 
 func html2string(h string) string {
-	t, err := html2text.FromString(h)
+	t, err := html2text.Textify(h)
 	if err == nil {
 		return t
 	}