# HG changeset patch # User Mikael Berthe # Date 1494624681 -7200 # Node ID 7d712d2bde73b8d77ff8aa3716f45d2700041789 # Parent d436b88d137b33b170ea19dfc799f2044f6eeaee Much improved html2text (fromhtml) Handles tags and mentions specifically; display the URL in the other cases. diff -r d436b88d137b -r 7d712d2bde73 printer/html2text/LICENSE --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/printer/html2text/LICENSE Fri May 12 23:31:21 2017 +0200 @@ -0,0 +1,23 @@ +The MIT License (MIT) + +Copyright (c) 2015 Shawn Goertzen +Copyright (c) 2017 Mikael Berthe + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff -r d436b88d137b -r 7d712d2bde73 printer/html2text/README.md --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/printer/html2text/README.md Fri May 12 23:31:21 2017 +0200 @@ -0,0 +1,10 @@ +# html2text + +This is a copy of github.com/sgoertzen/html2text, heavily customized for +Mastodon's HTML messages. + +html2text is an HTML to text converter written in Go. 
+This library will strip the html tags from the source and perform clean up on the text.
+This includes things like adding new lines correctly and appending on urls from links.
+
+For Mastodon tags, URLs are not displayed.
diff -r d436b88d137b -r 7d712d2bde73 printer/html2text/html2text.go
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/printer/html2text/html2text.go	Fri May 12 23:31:21 2017 +0200
@@ -0,0 +1,171 @@
+// Copyright (c) 2015 Shawn Goertzen
+// Copyright (c) 2017 Mikael Berthe
+//
+// This code mostly comes from github.com/sgoertzen/html2text,
+// with some specific but intrusive changes for Mastodon HTML messages.
+// For example, links are not displayed for hashtags and mentions,
+// and links alone are displayed for the other cases.
+//
+// Licensed under the MIT license.
+// Please see the LICENSE file in this directory.
+
+// Package html2text converts HTML into plain text, with dedicated handling
+// of the hashtag and mention links found in Mastodon messages.
+package html2text
+
+import (
+	"bytes"
+	"errors"
+	"strings"
+
+	"golang.org/x/net/html"
+)
+
+// breakers lists the elements that end the current output line once they
+// have been processed.
+var breakers = map[string]bool{
+	"br":  true,
+	"div": true,
+	"tr":  true,
+	"li":  true,
+	"p":   true,
+}
+
+// Textify turns an HTML body into a text string.
+//
+// The body is parsed with html.Parse and rendered by process; the result is
+// returned with leading and trailing whitespace removed. An error is
+// returned only when parsing fails, and it includes the parser's message.
+func Textify(body string) (string, error) {
+	doc, err := html.Parse(strings.NewReader(body))
+	if err != nil {
+		// Keep the historical message as a prefix, but report the cause too.
+		return "", errors.New("unable to parse the html: " + err.Error())
+	}
+
+	var buffer bytes.Buffer
+	process(doc, &buffer, "")
+	return strings.TrimSpace(buffer.String()), nil
+}
+
+// lastByte returns the final byte held by b, or 0 when b is empty.
+func lastByte(b *bytes.Buffer) byte {
+	if b.Len() == 0 {
+		return 0
+	}
+	return b.Bytes()[b.Len()-1]
+}
+
+// process appends the text rendering of n and of its descendants to b.
+//
+// class is the class attribute of the <span> directly enclosing n, if any
+// (anchor forwards the value it received to the children of a link); it is
+// only consulted when n is a link. The content of <head> is skipped, links
+// with children are delegated to anchor, and a newline is appended after
+// every element listed in breakers unless the output is empty or already
+// ends with one.
+func process(n *html.Node, b *bytes.Buffer, class string) {
+	switch {
+	case n.Type == html.ElementNode && n.Data == "head":
+		return
+	case n.Type == html.ElementNode && n.Data == "a" && n.FirstChild != nil:
+		// anchor renders the children of the link itself.
+		anchor(n, b, class)
+	default:
+		if n.Type == html.TextNode {
+			writeText(b, n.Data)
+		}
+
+		// Only a <span> hands its class down, and only to its direct children.
+		var childClass string
+		if n.Type == html.ElementNode && n.Data == "span" {
+			for _, attr := range n.Attr {
+				if attr.Key == "class" {
+					childClass = attr.Val
+					break
+				}
+			}
+		}
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			process(c, b, childClass)
+		}
+	}
+
+	if n.Type != html.ElementNode || !breakers[n.Data] {
+		return
+	}
+	if b.Len() == 0 || lastByte(b) == '\n' {
+		return // Nothing written yet, or the line is already terminated
+	}
+	// Remove the spaces ending the line before breaking it.
+	b.Truncate(len(bytes.TrimRight(b.Bytes(), " ")))
+	b.WriteString("\n")
+}
+
+// writeText appends the content of a text node to b.
+//
+// Leading and trailing spaces and tabs are stripped and non-breaking spaces
+// are turned into regular spaces. A single space is inserted first when b is
+// not empty and neither side of the junction provides one, unless b ends
+// with a newline, '@' or '#'.
+func writeText(b *bytes.Buffer, data string) {
+	text := strings.Replace(strings.Trim(data, " \t"), "\u00a0", " ", -1)
+
+	// Hints: does the output end, or the new text start, with a space?
+	last := lastByte(b)
+	prevSpace := last == ' '
+	nextSpace := strings.HasPrefix(text, " ")
+
+	if prevSpace && nextSpace {
+		b.WriteString(text[1:]) // Trim 1 space
+		return
+	}
+
+	// No separator at the very beginning, after a newline, or after '@'/'#'.
+	glued := b.Len() == 0 || last == '\n' || last == '@' || last == '#'
+	if !glued && !prevSpace && !nextSpace {
+		b.WriteString(" ")
+	}
+	b.WriteString(text)
+}
+
+// anchor appends the rendering of the link n to b.
+//
+// The children of n are rendered first. That text is written as is when
+// class is "tag" or "h-card", or when b ends with '@'. Otherwise a space is
+// written unless b already ends with a space or a newline, followed by the
+// rendered text if it starts with '#' or '@', or else by the value of the
+// href attribute (nothing more is written when n has no href).
+func anchor(n *html.Node, b *bytes.Buffer, class string) {
+	last := lastByte(b)
+
+	var tmpbuf bytes.Buffer
+	for c := n.FirstChild; c != nil; c = c.NextSibling {
+		process(c, &tmpbuf, class)
+	}
+
+	if class == "tag" || class == "h-card" || last == '@' {
+		b.Write(tmpbuf.Bytes())
+		return
+	}
+
+	// Add a leading space if needed
+	if last != ' ' && last != '\n' {
+		b.WriteString(" ")
+	}
+
+	s := tmpbuf.String()
+	if strings.HasPrefix(s, "#") || strings.HasPrefix(s, "@") {
+		b.WriteString(s) // Tag or mention: display content
+		return
+	}
+
+	// Display the href link. Use the attribute that matched: href is not
+	// necessarily the first attribute of the element.
+	for _, attr := range n.Attr {
+		if attr.Key == "href" {
+			b.WriteString(attr.Val)
+			break
+		}
+	}
+}
diff -r d436b88d137b -r 7d712d2bde73 printer/html2text/html2text_test.go
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/printer/html2text/html2text_test.go
Fri May 12 23:31:21 2017 +0200 @@ -0,0 +1,177 @@ +package html2text + +import ( + "github.com/stretchr/testify/assert" + "testing" +) + +func TestTextify(t *testing.T) { + expected := "body\nbody2" + r, e := Textify("body
body2") + assert.Nil(t, e) + assert.Equal(t, expected, r) +} + +func TestTextifyDiv(t *testing.T) { + expected := "first\nsecond" + r, e := Textify("
first
second") + assert.Nil(t, e) + assert.Equal(t, expected, r) +} + +/* +func TestTextifyLink(t *testing.T) { + expected := "somelink (link: someurl)" + r, e := Textify("somelink") + assert.Nil(t, e) + assert.Equal(t, expected, r) +} +*/ + +func TestTextifyDontDuplicateLink(t *testing.T) { + expected := "www.awesome.com" + r, e := Textify("www.awesome.com") + assert.Nil(t, e) + assert.Equal(t, expected, r) +} + +func TestTextifySpaces(t *testing.T) { + expected := "hello" + r, e := Textify("
hello
") + assert.Nil(t, e) + assert.Equal(t, expected, r) +} + +/* I don't think we want that for Mastodon... +func TestTextifySpacesMultiple(t *testing.T) { + expected := "hello goodbye" + r, e := Textify(" hello goodbye ") + assert.Nil(t, e) + assert.Equal(t, expected, r) +} +*/ + +func TestTextifyNonBreakingSpace(t *testing.T) { + expected := "a a" + r, e := Textify("a   a") + assert.Equal(t, expected, r) + assert.Nil(t, e) +} + +func TestTextifyLimitedNewLines(t *testing.T) { + expected := "abc\nxyz" + r, e := Textify("abc



xyz") + assert.Nil(t, e) + assert.Equal(t, expected, r) +} + +func TestTextifyTable(t *testing.T) { + expected := `Join by phone +1-877-668-4490 Call-in toll-free number (US/Canada) +1-408-792-6300 Call-in toll number (US/Canada) +Access code: 111 111 111 +https://akqa.webex.com/akqa/globalcallin.php?serviceType=MC&ED=299778282&tollFree=1 | http://www.webex.com/pdf/tollfree_restrictions.pdf` + + test := `
Join by phone
1-877-668-4490 Call-in toll-free number (US/Canada)
1-408-792-6300 Call-in toll number (US/Canada)
Access code: 111 111 111
Global call-in numbers  |  Toll-free calling restrictions
` + + r, e := Textify(test) + assert.Nil(t, e) + assert.Equal(t, expected, r) +} + +func TestTextifyComment(t *testing.T) { + expected := "this should appear" + r, e := Textify("this should appear") + assert.Nil(t, e) + assert.Equal(t, expected, r) +} + +func TestTextifyCommentInHead(t *testing.T) { + expected := "qwerty" + + body := ` qwerty ` + + r, e := Textify(body) + assert.Nil(t, e) + assert.Equal(t, expected, r) +} + +func TestTextifyLists(t *testing.T) { + expected := "a\nb\n1\n2" + + body := `
  1. a
  2. b
` + + r, e := Textify(body) + assert.Nil(t, e) + assert.Equal(t, expected, r) +} + +func TestTextifyMastodonSample1(t *testing.T) { + expected := "@magi hello \\U0001F607 @TEST" + + body := `

@magi hello \U0001F607 @TEST

` + + r, e := Textify(body) + assert.Nil(t, e) + assert.Equal(t, expected, r) +} + +func TestTextifyMastodonSample2(t *testing.T) { + expected := "@cadey It looks good at first glance\n\"case <-stop\" Actually you don't listen to stop channel, you close it if you want to stop the listener." + + body := `

@cadey It looks good at first glance

"case <-stop" Actually you don't listen to stop channel, you close it if you want to stop the listener.

` + + r, e := Textify(body) + assert.Nil(t, e) + assert.Equal(t, expected, r) +} + +func TestTextifyMastodonSample3(t *testing.T) { + expected := "From timeline: Materials research creates potential for improved computer chips and transistors #phys #physics ..." + + body := `From timeline: Materials research creates potential for improved computer chips and transistors #phys #physics

...

` + + r, e := Textify(body) + assert.Nil(t, e) + assert.Equal(t, expected, r) +} + +func TestTextifyMastodonSample4(t *testing.T) { + expected := "Vous reprendrez bien un peu de #Tolkein ?\n#Arte +7 propose un ensemble de 6 vidéos en plus du documentaire:\nhttp://www.arte.tv/fr/videos/RC-014610/tolkien/" + + body := `

Vous reprendrez bien un peu de #Tolkein ?
#Arte+7 propose un ensemble de 6 vidéos en plus du documentaire:

arte.tv/fr/videos/RC-014610/to

` + + r, e := Textify(body) + assert.Nil(t, e) + assert.Equal(t, expected, r) +} + +func TestTextifyMastodonMentionAndTag(t *testing.T) { + expected := "@ACh Mais heu ! Moi aussi je fais du #TootRadio de gens morts il y a 5 siècles. Gesulado, Charpentier, Mireille Mathieu..." + + body := `

@ACh Mais heu ! Moi aussi je fais du #TootRadio de gens morts il y a 5 siècles. Gesulado, Charpentier, Mireille Mathieu...

` + + r, e := Textify(body) + assert.Nil(t, e) + assert.Equal(t, expected, r) +} + +func TestTextifyMastodonLinkSpacing(t *testing.T) { + expected := "\"Twitter\" https://twitter.com/holly/status/123456789012345678" + + body := `

"Twitter" twitter.com/holly/status/86266

` + + r, e := Textify(body) + assert.Nil(t, e) + assert.Equal(t, expected, r) +} + +func TestTextifyMastodonMentionGNUSocial(t *testing.T) { + expected := "@username Hello." + + body := `@username Hello.` + + r, e := Textify(body) + assert.Nil(t, e) + assert.Equal(t, expected, r) +} diff -r d436b88d137b -r 7d712d2bde73 printer/plainprinter.go --- a/printer/plainprinter.go Fri May 12 00:09:04 2017 +0200 +++ b/printer/plainprinter.go Fri May 12 23:31:21 2017 +0200 @@ -12,10 +12,10 @@ "reflect" "time" - "github.com/jaytaylor/html2text" "github.com/m0t0k1ch1/gomif" "github.com/McKael/madon" + "github.com/McKael/madonctl/printer/html2text" ) // PlainPrinter is the default "plain text" printer @@ -120,7 +120,7 @@ } func html2string(h string) string { - t, err := html2text.FromString(h) + t, err := html2text.Textify(h) if err == nil { return t }