|
1 // Copyright (c) 2015 Shawn Goertzen |
|
2 // Copyright (c) 2017 Mikael Berthe |
|
3 // |
|
4 // This code mostly comes from github.com/sgoertzen/html2text, |
|
5 // with some specific but intrusive changes for Mastodon HTML messages. |
|
6 // For example, links are not displayed for hashtags and mentions, |
|
7 // and links alone are displayed for the other cases. |
|
8 // |
|
9 // Licensed under the MIT license. |
|
10 // Please see the LICENSE file in this directory. |
|
11 |
|
12 package html2text |
|
13 |
|
14 import ( |
|
15 "bytes" |
|
16 "errors" |
|
17 "golang.org/x/net/html" |
|
18 "strings" |
|
19 ) |
|
20 |
|
// breakers is the set of HTML element names after which process emits a
// line break once the element's content has been written.
var breakers = map[string]bool{
	"br":  true,
	"div": true,
	"li":  true,
	"p":   true,
	"tr":  true,
}
|
28 |
|
29 // Textify turns an HTML body into a text string |
|
30 func Textify(body string) (string, error) { |
|
31 r := strings.NewReader(body) |
|
32 doc, err := html.Parse(r) |
|
33 if err != nil { |
|
34 return "", errors.New("unable to parse the html") |
|
35 } |
|
36 var buffer bytes.Buffer |
|
37 process(doc, &buffer, "") |
|
38 |
|
39 s := strings.TrimSpace(buffer.String()) |
|
40 return s, nil |
|
41 } |
|
42 |
|
43 func process(n *html.Node, b *bytes.Buffer, class string) { |
|
44 processChildren := true |
|
45 |
|
46 if n.Type == html.ElementNode && n.Data == "head" { |
|
47 return |
|
48 } else if n.Type == html.ElementNode && n.Data == "a" && n.FirstChild != nil { |
|
49 anchor(n, b, class) |
|
50 processChildren = false |
|
51 } else if n.Type == html.TextNode { |
|
52 // Clean up data |
|
53 cleanData := strings.Replace(strings.Trim(n.Data, " \t"), "\u00a0", " ", -1) |
|
54 |
|
55 // Heuristics to add a whitespace character... |
|
56 var prevSpace, nextSpace bool // hint if previous/next char is a space |
|
57 var last byte |
|
58 bl := b.Len() |
|
59 if bl > 0 { |
|
60 last = b.Bytes()[bl-1] |
|
61 if last == ' ' { |
|
62 prevSpace = true |
|
63 } |
|
64 } |
|
65 if len(cleanData) > 0 && cleanData[0] == ' ' { |
|
66 nextSpace = true |
|
67 } |
|
68 if prevSpace && nextSpace { |
|
69 b.WriteString(cleanData[1:]) // Trim 1 space |
|
70 } else { |
|
71 if bl > 0 && last != '\n' && last != '@' && last != '#' && !prevSpace && !nextSpace { |
|
72 b.WriteString(" ") |
|
73 } |
|
74 b.WriteString(cleanData) |
|
75 } |
|
76 } |
|
77 |
|
78 if processChildren { |
|
79 var class string |
|
80 if n.Type == html.ElementNode && n.Data == "span" { |
|
81 for _, attr := range n.Attr { |
|
82 if attr.Key == "class" { |
|
83 class = attr.Val |
|
84 break |
|
85 } |
|
86 } |
|
87 } |
|
88 for c := n.FirstChild; c != nil; c = c.NextSibling { |
|
89 process(c, b, class) |
|
90 } |
|
91 } |
|
92 |
|
93 if b.Len() > 0 { |
|
94 bl := b.Len() |
|
95 last := b.Bytes()[bl-1] |
|
96 if last != '\n' && n.Type == html.ElementNode && breakers[n.Data] { |
|
97 // Remove previous space |
|
98 for last == ' ' { |
|
99 bl-- |
|
100 b.Truncate(bl) |
|
101 if bl > 0 { |
|
102 last = b.Bytes()[bl-1] |
|
103 } else { |
|
104 last = '\x00' |
|
105 } |
|
106 } |
|
107 b.WriteString("\n") |
|
108 } |
|
109 } |
|
110 } |
|
111 |
|
112 func anchor(n *html.Node, b *bytes.Buffer, class string) { |
|
113 bl := b.Len() |
|
114 var last byte |
|
115 if bl > 0 { |
|
116 last = b.Bytes()[bl-1] |
|
117 } |
|
118 |
|
119 var tmpbuf bytes.Buffer |
|
120 for c := n.FirstChild; c != nil; c = c.NextSibling { |
|
121 process(c, &tmpbuf, class) |
|
122 } |
|
123 |
|
124 if class == "tag" || class == "h-card" || last == '@' { |
|
125 b.Write(tmpbuf.Bytes()) |
|
126 return |
|
127 } |
|
128 |
|
129 // Add heading space if needed |
|
130 if last != ' ' && last != '\n' { |
|
131 b.WriteString(" ") |
|
132 } |
|
133 |
|
134 s := tmpbuf.String() |
|
135 if strings.HasPrefix(s, "#") || strings.HasPrefix(s, "@") { |
|
136 b.WriteString(s) // Tag or mention: display content |
|
137 return |
|
138 } |
|
139 |
|
140 // Display href link |
|
141 for _, attr := range n.Attr { |
|
142 if attr.Key == "href" { |
|
143 link := n.Attr[0].Val |
|
144 b.WriteString(link) |
|
145 break |
|
146 } |
|
147 } |
|
148 } |