printer/html2text/html2text.go
changeset 126 7d712d2bde73
child 132 4bf4f6ce268e
equal deleted inserted replaced
125:d436b88d137b 126:7d712d2bde73
       
     1 // Copyright (c) 2015 Shawn Goertzen
       
     2 // Copyright (c) 2017 Mikael Berthe
       
     3 //
       
     4 // This code mostly comes from github.com/sgoertzen/html2text,
       
     5 // with some specific but intrusive changes for Mastodon HTML messages.
       
     6 // For example, links are not displayed for hashtags and mentions,
       
     7 // and links alone are displayed for the other cases.
       
     8 //
       
     9 // Licensed under the MIT license.
       
     10 // Please see the LICENSE file in this directory.
       
    11 
       
    12 package html2text
       
    13 
       
    14 import (
       
    15 	"bytes"
       
    16 	"errors"
       
    17 	"golang.org/x/net/html"
       
    18 	"strings"
       
    19 )
       
    20 
       
    21 var breakers = map[string]bool{
       
    22 	"br":  true,
       
    23 	"div": true,
       
    24 	"tr":  true,
       
    25 	"li":  true,
       
    26 	"p":   true,
       
    27 }
       
    28 
       
    29 // Textify turns an HTML body into a text string
       
    30 func Textify(body string) (string, error) {
       
    31 	r := strings.NewReader(body)
       
    32 	doc, err := html.Parse(r)
       
    33 	if err != nil {
       
    34 		return "", errors.New("unable to parse the html")
       
    35 	}
       
    36 	var buffer bytes.Buffer
       
    37 	process(doc, &buffer, "")
       
    38 
       
    39 	s := strings.TrimSpace(buffer.String())
       
    40 	return s, nil
       
    41 }
       
    42 
       
    43 func process(n *html.Node, b *bytes.Buffer, class string) {
       
    44 	processChildren := true
       
    45 
       
    46 	if n.Type == html.ElementNode && n.Data == "head" {
       
    47 		return
       
    48 	} else if n.Type == html.ElementNode && n.Data == "a" && n.FirstChild != nil {
       
    49 		anchor(n, b, class)
       
    50 		processChildren = false
       
    51 	} else if n.Type == html.TextNode {
       
    52 		// Clean up data
       
    53 		cleanData := strings.Replace(strings.Trim(n.Data, " \t"), "\u00a0", " ", -1)
       
    54 
       
    55 		// Heuristics to add a whitespace character...
       
    56 		var prevSpace, nextSpace bool // hint if previous/next char is a space
       
    57 		var last byte
       
    58 		bl := b.Len()
       
    59 		if bl > 0 {
       
    60 			last = b.Bytes()[bl-1]
       
    61 			if last == ' ' {
       
    62 				prevSpace = true
       
    63 			}
       
    64 		}
       
    65 		if len(cleanData) > 0 && cleanData[0] == ' ' {
       
    66 			nextSpace = true
       
    67 		}
       
    68 		if prevSpace && nextSpace {
       
    69 			b.WriteString(cleanData[1:]) // Trim 1 space
       
    70 		} else {
       
    71 			if bl > 0 && last != '\n' && last != '@' && last != '#' && !prevSpace && !nextSpace {
       
    72 				b.WriteString(" ")
       
    73 			}
       
    74 			b.WriteString(cleanData)
       
    75 		}
       
    76 	}
       
    77 
       
    78 	if processChildren {
       
    79 		var class string
       
    80 		if n.Type == html.ElementNode && n.Data == "span" {
       
    81 			for _, attr := range n.Attr {
       
    82 				if attr.Key == "class" {
       
    83 					class = attr.Val
       
    84 					break
       
    85 				}
       
    86 			}
       
    87 		}
       
    88 		for c := n.FirstChild; c != nil; c = c.NextSibling {
       
    89 			process(c, b, class)
       
    90 		}
       
    91 	}
       
    92 
       
    93 	if b.Len() > 0 {
       
    94 		bl := b.Len()
       
    95 		last := b.Bytes()[bl-1]
       
    96 		if last != '\n' && n.Type == html.ElementNode && breakers[n.Data] {
       
    97 			// Remove previous space
       
    98 			for last == ' ' {
       
    99 				bl--
       
   100 				b.Truncate(bl)
       
   101 				if bl > 0 {
       
   102 					last = b.Bytes()[bl-1]
       
   103 				} else {
       
   104 					last = '\x00'
       
   105 				}
       
   106 			}
       
   107 			b.WriteString("\n")
       
   108 		}
       
   109 	}
       
   110 }
       
   111 
       
   112 func anchor(n *html.Node, b *bytes.Buffer, class string) {
       
   113 	bl := b.Len()
       
   114 	var last byte
       
   115 	if bl > 0 {
       
   116 		last = b.Bytes()[bl-1]
       
   117 	}
       
   118 
       
   119 	var tmpbuf bytes.Buffer
       
   120 	for c := n.FirstChild; c != nil; c = c.NextSibling {
       
   121 		process(c, &tmpbuf, class)
       
   122 	}
       
   123 
       
   124 	if class == "tag" || class == "h-card" || last == '@' {
       
   125 		b.Write(tmpbuf.Bytes())
       
   126 		return
       
   127 	}
       
   128 
       
   129 	// Add heading space if needed
       
   130 	if last != ' ' && last != '\n' {
       
   131 		b.WriteString(" ")
       
   132 	}
       
   133 
       
   134 	s := tmpbuf.String()
       
   135 	if strings.HasPrefix(s, "#") || strings.HasPrefix(s, "@") {
       
   136 		b.WriteString(s) // Tag or mention: display content
       
   137 		return
       
   138 	}
       
   139 
       
   140 	// Display href link
       
   141 	for _, attr := range n.Attr {
       
   142 		if attr.Key == "href" {
       
   143 			link := n.Attr[0].Val
       
   144 			b.WriteString(link)
       
   145 			break
       
   146 		}
       
   147 	}
       
   148 }