diff -r d436b88d137b -r 7d712d2bde73 printer/html2text/html2text.go --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/printer/html2text/html2text.go Fri May 12 23:31:21 2017 +0200 @@ -0,0 +1,148 @@ +// Copyright (c) 2015 Shawn Goertzen +// Copyright (c) 2017 Mikael Berthe +// +// This code mostly comes from github.com/sgoertzen/html2text, +// with some specific but intrusive changes for Mastodon HTML messages. +// For example, links are not displayed for hashtags and mentions, +// and links alone are displayed for the other cases. +// +// Licensed under the MIT license. +// Please see the LICENSE file is this directory. + +package html2text + +import ( + "bytes" + "errors" + "golang.org/x/net/html" + "strings" +) + +var breakers = map[string]bool{ + "br": true, + "div": true, + "tr": true, + "li": true, + "p": true, +} + +// Textify turns an HTML body into a text string +func Textify(body string) (string, error) { + r := strings.NewReader(body) + doc, err := html.Parse(r) + if err != nil { + return "", errors.New("unable to parse the html") + } + var buffer bytes.Buffer + process(doc, &buffer, "") + + s := strings.TrimSpace(buffer.String()) + return s, nil +} + +func process(n *html.Node, b *bytes.Buffer, class string) { + processChildren := true + + if n.Type == html.ElementNode && n.Data == "head" { + return + } else if n.Type == html.ElementNode && n.Data == "a" && n.FirstChild != nil { + anchor(n, b, class) + processChildren = false + } else if n.Type == html.TextNode { + // Clean up data + cleanData := strings.Replace(strings.Trim(n.Data, " \t"), "\u00a0", " ", -1) + + // Heuristics to add a whitespace character... + var prevSpace, nextSpace bool // hint if previous/next char is a space + var last byte + bl := b.Len() + if bl > 0 { + last = b.Bytes()[bl-1] + if last == ' ' { + prevSpace = true + } + } + if len(cleanData) > 0 && cleanData[0] == ' ' { + nextSpace = true + } + if prevSpace && nextSpace { + b.WriteString(cleanData[1:]) // Trim 1 space + } else { + if bl > 0 && last != '\n' && last != '@' && last != '#' && !prevSpace && !nextSpace { + b.WriteString(" ") + } + b.WriteString(cleanData) + } + } + + if processChildren { + var class string + if n.Type == html.ElementNode && n.Data == "span" { + for _, attr := range n.Attr { + if attr.Key == "class" { + class = attr.Val + break + } + } + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + process(c, b, class) + } + } + + if b.Len() > 0 { + bl := b.Len() + last := b.Bytes()[bl-1] + if last != '\n' && n.Type == html.ElementNode && breakers[n.Data] { + // Remove previous space + for last == ' ' { + bl-- + b.Truncate(bl) + if bl > 0 { + last = b.Bytes()[bl-1] + } else { + last = '\x00' + } + } + b.WriteString("\n") + } + } +} + +func anchor(n *html.Node, b *bytes.Buffer, class string) { + bl := b.Len() + var last byte + if bl > 0 { + last = b.Bytes()[bl-1] + } + + var tmpbuf bytes.Buffer + for c := n.FirstChild; c != nil; c = c.NextSibling { + process(c, &tmpbuf, class) + } + + if class == "tag" || class == "h-card" || last == '@' { + b.Write(tmpbuf.Bytes()) + return + } + + // Add heading space if needed + if last != ' ' && last != '\n' { + b.WriteString(" ") + } + + s := tmpbuf.String() + if strings.HasPrefix(s, "#") || strings.HasPrefix(s, "@") { + b.WriteString(s) // Tag or mention: display content + return + } + + // Display href link + for _, attr := range n.Attr { + if attr.Key == "href" { + link := n.Attr[0].Val + b.WriteString(link) + break + } + } +}