zorg/vendor/github.com/mmcdole/gofeed/internal/shared/xmlbase.go

259 lines
5.6 KiB
Go
Raw Normal View History

2020-10-08 15:33:26 -04:00
package shared
import (
"bytes"
"fmt"
"golang.org/x/net/html"
"net/url"
"strings"
"github.com/mmcdole/goxpp"
)
var (
// htmlURIAttrs is the set of HTML attribute names whose values are treated
// as URIs. ResolveHTML looks up each element attribute's key in this map
// and, on a hit, rewrites the value relative to the current base.
//
// Background on which attributes carry URIs:
// https://pythonhosted.org/feedparser/resolving-relative-links.html
//
// The list is not exhaustive; catching every possible URI attribute is
// non-trivial:
// https://stackoverflow.com/questions/2725156/complete-list-of-html-tag-attributes-which-have-a-url-value
htmlURIAttrs = map[string]bool{
"action": true,
"background": true,
"cite": true,
"codebase": true,
"data": true,
"href": true,
"poster": true,
"profile": true,
"scheme": true,
"src": true,
"uri": true,
"usemap": true,
}
)
// urlStack is a LIFO stack of base URLs. The most recently pushed URL lives
// at the end of the slice, so push and pop are amortized O(1) and do not copy
// the existing entries.
type urlStack []*url.URL

// push places u on top of the stack.
func (s *urlStack) push(u *url.URL) {
	*s = append(*s, u)
}

// pop removes and returns the URL on top of the stack. It returns nil when
// the receiver is nil or the stack is empty.
func (s *urlStack) pop() *url.URL {
	if s == nil || len(*s) == 0 {
		return nil
	}
	last := len(*s) - 1
	top := (*s)[last]
	// Clear the vacated slot so the backing array does not keep the popped
	// URL reachable.
	(*s)[last] = nil
	*s = (*s)[:last]
	return top
}

// top returns the URL on top of the stack without removing it. It returns
// nil when the receiver is nil or the stack is empty.
func (s *urlStack) top() *url.URL {
	if s == nil || len(*s) == 0 {
		return nil
	}
	return (*s)[len(*s)-1]
}
// XMLBase tracks the xml:base values in scope while a document is walked with
// NextTag and resolves relative URIs against the innermost one.
type XMLBase struct {
// stack holds one resolved base URL per open element; NextTag pushes on
// every StartTag and pops after every EndTag.
stack urlStack
// URIAttrs is the set of attribute names whose values are rewritten to
// absolute URIs on each StartTag. Lookups use the lower-cased local name
// of the attribute. A nil map disables attribute resolution.
URIAttrs map[string]bool
}
// FindRoot advances the parser with NextTag until the first StartTag event
// and returns that event. Any error from NextTag is returned unchanged,
// including the one NextTag reports when the document ends before a tag is
// found.
func (b *XMLBase) FindRoot(p *xpp.XMLPullParser) (event xpp.XMLEventType, err error) {
	for {
		event, err = b.NextTag(p)
		switch {
		case err != nil:
			return event, err
		case event == xpp.StartTag:
			return event, nil
		case event == xpp.EndDocument:
			return event, fmt.Errorf("Failed to find root node before document end.")
		}
		// Any other event (an EndTag) is skipped; keep scanning.
	}
}
// NextTag consumes tokens until it reaches a StartTag or an EndTag and
// returns that event. Along the way it maintains the base URL stack, so that
// the top of the stack (available through CurrentBase and CurrentBaseURL) is
// the absolute base URI against which relative URIs should be resolved:
//
//   - on a StartTag the element's xml:base (possibly empty) is pushed and the
//     element's URI attributes are resolved in place;
//   - the entry belonging to an element is popped on the call that follows
//     its EndTag, so the base is still in scope while the caller handles the
//     EndTag itself.
//
// Unlike goxpp's own NextTag, this method won't return an error when the next
// immediate token isn't a Start/EndTag; it keeps consuming tokens instead. An
// error is returned if the parser fails, if an xml:base or URI attribute
// cannot be parsed, or if EndDocument is reached first.
func (b *XMLBase) NextTag(p *xpp.XMLPullParser) (event xpp.XMLEventType, err error) {
	for {
		// The token we are leaving closed an element: its base is out of scope.
		if p.Event == xpp.EndTag {
			b.pop()
		}

		if event, err = p.Next(); err != nil {
			return event, err
		}

		switch event {
		case xpp.EndTag:
			return event, nil
		case xpp.StartTag:
			if err = b.push(parseBase(p)); err != nil {
				return event, err
			}
			err = b.resolveAttrs(p)
			return event, err
		case xpp.EndDocument:
			return event, fmt.Errorf("Failed to find NextTag before reaching the end of the document.")
		}
	}
}
// parseBase returns the value of the xml:base attribute on the parser's
// current element, or "" when the element does not carry one. The attribute
// is matched by local name "base" in the XML namespace.
func parseBase(p *xpp.XMLPullParser) string {
	const xmlNamespace = "http://www.w3.org/XML/1998/namespace"
	for i := range p.Attrs {
		name := p.Attrs[i].Name
		if name.Space == xmlNamespace && name.Local == "base" {
			return p.Attrs[i].Value
		}
	}
	return ""
}
// push parses base, resolves it against the current base URL when there is
// one, and pushes the result onto the stack. It returns the parse error, and
// pushes nothing, when base is not a valid URL.
func (b *XMLBase) push(base string) error {
	parsed, err := url.Parse(base)
	if err != nil {
		return err
	}
	if parent := b.CurrentBaseURL(); parent != nil {
		parsed = parent.ResolveReference(parsed)
	}
	b.stack.push(parsed)
	return nil
}
// pop removes the top entry of the base URL stack and returns it in string
// form, or "" when the stack was empty.
func (b *XMLBase) pop() string {
	popped := b.stack.pop()
	if popped == nil {
		return ""
	}
	return popped.String()
}
// CurrentBaseURL returns the base URL on top of the stack, or nil when the
// stack is empty. The returned pointer is the stack's own entry, not a copy.
func (b *XMLBase) CurrentBaseURL() *url.URL {
return b.stack.top()
}
// CurrentBase returns the current base URL in string form, or "" when no
// base is in scope.
func (b *XMLBase) CurrentBase() string {
	base := b.CurrentBaseURL()
	if base == nil {
		return ""
	}
	return base.String()
}
// ResolveURL resolves u relative to the current base and returns the
// resulting absolute URL as a string.
//
// When there is no current base (or it renders as ""), u is returned
// unchanged. When u cannot be parsed, u is returned together with the parse
// error.
//
// Side effect: if u is non-empty and the current base has a non-empty path
// that does not end in "/", a trailing "/" is appended to the base entry on
// the stack itself, so the change remains visible to later calls.
func (b *XMLBase) ResolveURL(u string) (string, error) {
	if b.CurrentBase() == "" {
		return u, nil
	}

	ref, err := url.Parse(u)
	if err != nil {
		return u, err
	}

	base := b.CurrentBaseURL()
	lacksSlash := base.Path != "" && !strings.HasSuffix(base.Path, "/")
	if lacksSlash && u != "" {
		// There's no reason someone would use a path in xml:base if they
		// didn't mean for it to be a directory.
		base.Path += "/"
	}

	return base.ResolveReference(ref).String(), nil
}
// resolveAttrs rewrites, in place, the value of every attribute on the
// parser's current element whose lower-cased local name is listed in
// b.URIAttrs, resolving it against the current base. It stops at, and
// returns, the first resolution error.
func (b *XMLBase) resolveAttrs(p *xpp.XMLPullParser) error {
	for i := range p.Attrs {
		key := strings.ToLower(p.Attrs[i].Name.Local)
		if !b.URIAttrs[key] {
			continue
		}
		resolved, err := b.ResolveURL(p.Attrs[i].Value)
		if err != nil {
			return err
		}
		p.Attrs[i].Value = resolved
	}
	return nil
}
// ResolveHTML returns relHTML with the relative URIs in its attributes
// resolved against the current base.
//
// Every attribute whose key is listed in htmlURIAttrs is resolved, including
// several on the same element (for example both src and poster on a video
// tag). An attribute whose value cannot be resolved is left as it was.
//
// When no base is in scope, relHTML is returned unchanged. If parsing or
// serialization fails, the original string is returned along with the error.
func (b *XMLBase) ResolveHTML(relHTML string) (string, error) {
	if b.CurrentBase() == "" {
		return relHTML, nil
	}

	doc, err := html.Parse(strings.NewReader(relHTML))
	if err != nil {
		return relHTML, err
	}

	// Recursively traverse the tree, resolving relative URIs in attributes.
	var visit func(*html.Node)
	visit = func(n *html.Node) {
		if n.Type == html.ElementNode {
			for i, a := range n.Attr {
				if !htmlURIAttrs[a.Key] {
					continue
				}
				// Best effort: an unparsable value is kept verbatim rather
				// than failing the whole fragment.
				if absVal, err := b.ResolveURL(a.Val); err == nil {
					n.Attr[i].Val = absVal
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			visit(c)
		}
	}
	visit(doc)

	var w bytes.Buffer
	if err := html.Render(&w, doc); err != nil {
		return relHTML, err
	}

	// html.Render() always writes a complete html5 document, so strip the html
	// and body tags.
	absHTML := strings.TrimPrefix(w.String(), "<html><head></head><body>")
	absHTML = strings.TrimSuffix(absHTML, "</body></html>")
	return absHTML, nil
}