Xpath scraping from URL (#285)

* Add xpath performer and scene scraping * Add studio scraping * Refactor code * Fix compile error * Don't overwrite performer URL during a scrape
2025-12-17 20:34:37 +03:00 · 2020-01-05 03:39:33 +11:00
parent d35f3a9b10
commit 7fdaccf669
93 changed files with 174400 additions and 4 deletions
--- a/vendor/github.com/antchfx/htmlquery/.gitignore
+++ b/vendor/github.com/antchfx/htmlquery/.gitignore
@@ -0,0 +1,32 @@
+# vscode
+.vscode
+debug
+*.test
+
+./build
+
+# Compiled Object files, Static and Dynamic libs (Shared Objects)
+*.o
+*.a
+*.so
+
+
+# Folders
+_obj
+_test
+
+# Architecture specific extensions/prefixes
+*.[568vq]
+[568vq].out
+
+*.cgo1.go
+*.cgo2.c
+_cgo_defun.c
+_cgo_gotypes.go
+_cgo_export.*
+
+_testmain.go
+
+*.exe
+*.test
+*.prof
--- a/vendor/github.com/antchfx/htmlquery/.travis.yml
+++ b/vendor/github.com/antchfx/htmlquery/.travis.yml
@@ -0,0 +1,16 @@
+language: go
+
+go:
+  - 1.6
+  - 1.7
+  - 1.8
+
+install:
+  - go get golang.org/x/net/html/charset
+  - go get golang.org/x/net/html
+  - go get github.com/antchfx/xpath
+  - go get github.com/mattn/goveralls
+  - go get github.com/golang/groupcache
+
+script:
+  - $HOME/gopath/bin/goveralls -service=travis-ci
--- a/vendor/github.com/antchfx/htmlquery/LICENSE
+++ b/vendor/github.com/antchfx/htmlquery/LICENSE
@@ -0,0 +1,17 @@
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
--- a/vendor/github.com/antchfx/htmlquery/README.md
+++ b/vendor/github.com/antchfx/htmlquery/README.md
@@ -0,0 +1,158 @@
+htmlquery
+====
+[![Build Status](https://travis-ci.org/antchfx/htmlquery.svg?branch=master)](https://travis-ci.org/antchfx/htmlquery)
+[![Coverage Status](https://coveralls.io/repos/github/antchfx/htmlquery/badge.svg?branch=master)](https://coveralls.io/github/antchfx/htmlquery?branch=master)
+[![GoDoc](https://godoc.org/github.com/antchfx/htmlquery?status.svg)](https://godoc.org/github.com/antchfx/htmlquery)
+[![Go Report Card](https://goreportcard.com/badge/github.com/antchfx/htmlquery)](https://goreportcard.com/report/github.com/antchfx/htmlquery)
+
+Overview
+====
+
+`htmlquery` is an XPath query package for HTML that lets you extract data from, or evaluate expressions against, HTML documents using XPath expressions.
+
+`htmlquery` has a built-in query object cache based on [LRU](https://godoc.org/github.com/golang/groupcache/lru) that keeps the most recently used XPath query strings in compiled form. Enabling the cache avoids recompiling the same XPath expression on every query.
+
+Installation
+====
+
+```
+go get github.com/antchfx/htmlquery
+```
+
+Getting Started
+====
+
+#### Query, returns matched elements or error.
+
+```go
+nodes, err := htmlquery.QueryAll(doc, "//a")
+if err != nil {
+	panic(`not a valid XPath expression.`)
+}
+```
+
+#### Load HTML document from URL.
+
+```go
+doc, err := htmlquery.LoadURL("http://example.com/")
+```
+
+#### Load HTML document from file.
+
+```go
+filePath := "/home/user/sample.html"
+doc, err := htmlquery.LoadDoc(filePath)
+```
+
+#### Load HTML document from string.
+
+```go
+s := `<html>....</html>`
+doc, err := htmlquery.Parse(strings.NewReader(s))
+```
+
+#### Find all A elements.
+
+```go
+list := htmlquery.Find(doc, "//a")
+```
+
+#### Find all A elements that have `href` attribute.
+
+```go
+list := htmlquery.Find(doc, "//a[@href]")
+```
+
+#### Find all A elements with `href` attribute and only return `href` value.
+
+```go
+list := htmlquery.Find(doc, "//a/@href")
+for _, n := range list {
+	fmt.Println(htmlquery.InnerText(n)) // output @href value without A element.
+}
+```
+
+#### Find the third A element.
+
+```go
+a := htmlquery.FindOne(doc, "//a[3]")
+```
+
+#### Count all IMG elements.
+
+```go
+expr, _ := xpath.Compile("count(//img)")
+v := expr.Evaluate(htmlquery.CreateXPathNavigator(doc)).(float64)
+fmt.Printf("total count is %f", v)
+```
+
+
+FAQ
+====
+
+#### `Find()` vs `QueryAll()`, which is better?
+
+`Find` and `QueryAll` do the same thing: they search for all matching HTML nodes.
+`Find` panics if you give it an invalid XPath query, whereas `QueryAll` returns an error instead.
+
+#### Can I save my query expression object for the next query?
+
+Yes, you can. The `QuerySelector` and `QuerySelectorAll` functions accept a compiled query expression object.
+
+Caching (or reusing) a query expression object avoids recompiling the XPath query expression and improves query performance.
+
+#### Disable caching feature
+
+```
+htmlquery.DisableSelectorCache = true
+```
+
+Changelogs
+===
+
+2019-11-19 
+- Add built-in query object cache feature, avoid re-compilation for the same query string. [#16](https://github.com/antchfx/htmlquery/issues/16)
+- Added LoadDoc [#18](https://github.com/antchfx/htmlquery/pull/18)
+
+2019-10-05 
+- Add new methods that return an error for an invalid XPath expression instead of panicking: `QueryAll` and `Query`.
+- Add `QuerySelector` and `QuerySelectorAll` methods, which support reusing your query object.
+
+2019-02-04
+- [#7](https://github.com/antchfx/htmlquery/issues/7) Removed deprecated `FindEach()` and `FindEachWithBreak()` methods.
+
+2018-12-28
+- Avoid adding duplicate elements to list for `Find()` method. [#6](https://github.com/antchfx/htmlquery/issues/6)
+
+Tutorial
+===
+
+```go
+func main() {
+	doc, err := htmlquery.LoadURL("https://www.bing.com/search?q=golang")
+	if err != nil {
+		panic(err)
+	}
+	// Find all news item.
+	list, err := htmlquery.QueryAll(doc, "//ol/li")
+	if err != nil {
+		panic(err)
+	}
+	for i, n := range list {
+		a := htmlquery.FindOne(n, "//a")
+		fmt.Printf("%d %s(%s)\n", i, htmlquery.InnerText(a), htmlquery.SelectAttr(a, "href"))
+	}
+}
+```
+
+List of supported XPath query packages
+===
+| Name                                              | Description                               |
+| ------------------------------------------------- | ----------------------------------------- |
+| [htmlquery](https://github.com/antchfx/htmlquery) | XPath query package for the HTML document |
+| [xmlquery](https://github.com/antchfx/xmlquery)   | XPath query package for the XML document  |
+| [jsonquery](https://github.com/antchfx/jsonquery) | XPath query package for the JSON document |
+
+Questions
+===
+Please let me know if you have any questions.
--- a/vendor/github.com/antchfx/htmlquery/cache.go
+++ b/vendor/github.com/antchfx/htmlquery/cache.go
@@ -0,0 +1,40 @@
+package htmlquery
+
+import (
+	"sync"
+
+	"github.com/golang/groupcache/lru"
+
+	"github.com/antchfx/xpath"
+)
+
+// DisableSelectorCache disables caching of compiled query selectors when set
+// to true; every query string is then compiled again on each call.
+var DisableSelectorCache = false
+
+// SelectorCacheMaxEntries is the maximum number of compiled selectors kept in
+// the cache. Default is 50. Caching is disabled if SelectorCacheMaxEntries <= 0.
+//
+// The value is read once, when the cache is created by the first cached
+// query; changing it afterwards does not resize the existing cache.
+var SelectorCacheMaxEntries = 50
+
+var (
+	cacheOnce sync.Once
+	cache     *lru.Cache
+	// cacheMutex serializes every access to cache. lru.Cache is documented as
+	// not safe for concurrent access, and even Get mutates its recency list.
+	cacheMutex sync.Mutex
+)
+
+// getQuery returns the compiled form of the XPath expression expr.
+//
+// When caching is disabled (DisableSelectorCache is true or
+// SelectorCacheMaxEntries <= 0) the expression is compiled on every call.
+// Otherwise the compiled expression is looked up in, and on a miss added to,
+// a package-wide LRU cache keyed by the expression string. Expressions that
+// fail to compile are not cached and the compile error is returned.
+func getQuery(expr string) (*xpath.Expr, error) {
+	if DisableSelectorCache || SelectorCacheMaxEntries <= 0 {
+		return xpath.Compile(expr)
+	}
+	cacheOnce.Do(func() {
+		// Honor the configured limit instead of a hard-coded size.
+		cache = lru.New(SelectorCacheMaxEntries)
+	})
+
+	cacheMutex.Lock()
+	defer cacheMutex.Unlock()
+	if v, ok := cache.Get(expr); ok {
+		return v.(*xpath.Expr), nil
+	}
+	v, err := xpath.Compile(expr)
+	if err != nil {
+		return nil, err
+	}
+	cache.Add(expr, v)
+	return v, nil
+}
--- a/vendor/github.com/antchfx/htmlquery/query.go
+++ b/vendor/github.com/antchfx/htmlquery/query.go
@@ -0,0 +1,338 @@
+/*
+Package htmlquery provides functions for extracting data from HTML documents using XPath expressions.
+*/
+package htmlquery
+
+import (
+	"bufio"
+	"bytes"
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+
+	"github.com/antchfx/xpath"
+	"golang.org/x/net/html"
+	"golang.org/x/net/html/charset"
+)
+
+var _ xpath.NodeNavigator = &NodeNavigator{}
+
+// CreateXPathNavigator returns a new NodeNavigator for the given html.Node.
+// The node top is used both as the navigator's root and as its starting
+// position, and the navigator starts on the node itself rather than on one of
+// its attributes.
+func CreateXPathNavigator(top *html.Node) *NodeNavigator {
+	nav := new(NodeNavigator)
+	nav.root = top
+	nav.curr = top
+	nav.attr = -1
+	return nav
+}
+
+// Find returns all nodes matched by evaluating the XPath expression expr
+// against top. It behaves like QueryAll, except that it panics with the
+// error when expr cannot be parsed instead of returning it.
+//
+// See `QueryAll()` function.
+func Find(top *html.Node, expr string) []*html.Node {
+	matched, err := QueryAll(top, expr)
+	if err == nil {
+		return matched
+	}
+	panic(err)
+}
+
+// FindOne returns the first node matched by evaluating the XPath expression
+// expr against top, or nil when nothing matches. It behaves like Query,
+// except that it panics with the error when expr cannot be parsed.
+//
+// See `Query()` function.
+func FindOne(top *html.Node, expr string) *html.Node {
+	first, err := Query(top, expr)
+	if err == nil {
+		return first
+	}
+	panic(err)
+}
+
+// QueryAll compiles the XPath expression expr (through the selector cache
+// when it is enabled) and returns every html.Node it matches starting from
+// top. It returns an error, and no nodes, if expr cannot be parsed.
+func QueryAll(top *html.Node, expr string) ([]*html.Node, error) {
+	selector, err := getQuery(expr)
+	if err != nil {
+		return nil, err
+	}
+	return QuerySelectorAll(top, selector), nil
+}
+
+// Query compiles the XPath expression expr (through the selector cache when
+// it is enabled) and returns the first html.Node it matches starting from
+// top, or nil when nothing matches.
+//
+// It returns an error if the expression expr cannot be parsed.
+func Query(top *html.Node, expr string) (*html.Node, error) {
+	selector, err := getQuery(expr)
+	if err == nil {
+		return QuerySelector(top, selector), nil
+	}
+	return nil, err
+}
+
+// QuerySelector evaluates the compiled XPath selector against top and returns
+// the first matched html.Node, or nil when the selector matches nothing.
+func QuerySelector(top *html.Node, selector *xpath.Expr) *html.Node {
+	iter := selector.Select(CreateXPathNavigator(top))
+	if !iter.MoveNext() {
+		return nil
+	}
+	nav := iter.Current().(*NodeNavigator)
+	return getCurrentNode(nav)
+}
+
+// QuerySelectorAll evaluates the compiled XPath selector against top and
+// returns the matched html.Node values in iteration order.
+//
+// A match is skipped when it duplicates the FIRST collected node: either it
+// is the very same node, or it is an attribute match whose name and value
+// equal the first node's Data and inner text. Later results are not compared
+// with each other.
+func QuerySelectorAll(top *html.Node, selector *xpath.Expr) []*html.Node {
+	var matched []*html.Node
+	iter := selector.Select(CreateXPathNavigator(top))
+	for iter.MoveNext() {
+		nav := iter.Current().(*NodeNavigator)
+		node := getCurrentNode(nav)
+		if len(matched) == 0 {
+			matched = append(matched, node)
+			continue
+		}
+		first := matched[0]
+		if first == node {
+			// Same node as the first result.
+			continue
+		}
+		if nav.NodeType() == xpath.AttributeNode &&
+			nav.LocalName() == first.Data && nav.Value() == InnerText(first) {
+			// Attribute with the same name and value as the first result.
+			continue
+		}
+		matched = append(matched, node)
+	}
+	return matched
+}
+
+// LoadURL fetches url with http.Get and parses the response body as HTML.
+// The body is wrapped by charset.NewReader, which is given the response's
+// Content-Type header, before being handed to the HTML parser.
+//
+// The HTTP status code is not inspected; whatever body is returned is parsed.
+// An error is returned when the request, the charset reader, or the parse
+// fails.
+func LoadURL(url string) (*html.Node, error) {
+	res, err := http.Get(url)
+	if err != nil {
+		return nil, err
+	}
+	defer res.Body.Close()
+
+	contentType := res.Header.Get("Content-Type")
+	decoded, err := charset.NewReader(res.Body, contentType)
+	if err != nil {
+		return nil, err
+	}
+	return html.Parse(decoded)
+}
+
+// LoadDoc opens the file at path and parses its contents as an HTML document.
+// The file is read through a buffered reader and closed before returning.
+// An error is returned when the file cannot be opened or parsing fails.
+func LoadDoc(path string) (*html.Node, error) {
+	file, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer file.Close()
+
+	buffered := bufio.NewReader(file)
+	return html.Parse(buffered)
+}
+
+// getCurrentNode converts the navigator's current position into an html.Node.
+//
+// For a non-attribute position it returns the underlying node itself. For an
+// attribute position it builds a new, parent-less element node whose Data is
+// the attribute name and whose only child is a text node holding the
+// attribute value, so callers can read the value with InnerText.
+func getCurrentNode(n *NodeNavigator) *html.Node {
+	if n.NodeType() != xpath.AttributeNode {
+		return n.curr
+	}
+	text := &html.Node{Type: html.TextNode, Data: n.Value()}
+	return &html.Node{
+		Type:       html.ElementNode,
+		Data:       n.LocalName(),
+		FirstChild: text,
+		LastChild:  text,
+	}
+}
+
+// Parse reads HTML from r and returns the root of the resulting parse tree.
+// It delegates directly to html.Parse and returns its result and error.
+func Parse(r io.Reader) (*html.Node, error) {
+	doc, err := html.Parse(r)
+	return doc, err
+}
+
+// InnerText returns the text between the start and end tags of the object:
+// the Data of every text node in the subtree rooted at n, concatenated in
+// depth-first order. Comment nodes are skipped.
+//
+// A nil node yields the empty string, so the result of FindOne or Query can
+// be passed in directly even when nothing matched (consistent with
+// SelectAttr, which also tolerates nil).
+func InnerText(n *html.Node) string {
+	if n == nil {
+		return ""
+	}
+
+	var buf bytes.Buffer
+	var collect func(*html.Node)
+	collect = func(node *html.Node) {
+		switch node.Type {
+		case html.TextNode:
+			buf.WriteString(node.Data)
+			return
+		case html.CommentNode:
+			// Comments contribute no text and are not descended into.
+			return
+		}
+		for child := node.FirstChild; child != nil; child = child.NextSibling {
+			collect(child)
+		}
+	}
+	collect(n)
+	return buf.String()
+}
+
+// SelectAttr returns the value of the attribute called name on n, or the
+// empty string when n is nil or has no such attribute.
+//
+// A parent-less element node whose Data equals name is treated specially: its
+// inner text is returned. This is the shape getCurrentNode produces for an
+// attribute match, so the value of a selected attribute can be read back by
+// its name.
+func SelectAttr(n *html.Node, name string) (val string) {
+	switch {
+	case n == nil:
+		return ""
+	case n.Type == html.ElementNode && n.Parent == nil && name == n.Data:
+		return InnerText(n)
+	}
+	for i := range n.Attr {
+		if n.Attr[i].Key == name {
+			return n.Attr[i].Val
+		}
+	}
+	return ""
+}
+
+// OutputHTML renders n as HTML text, tags included. When self is true the
+// node itself is rendered; otherwise only its children are rendered, one
+// after another. Errors reported by html.Render are ignored.
+func OutputHTML(n *html.Node, self bool) string {
+	var out bytes.Buffer
+	if self {
+		html.Render(&out, n)
+		return out.String()
+	}
+	for child := n.FirstChild; child != nil; child = child.NextSibling {
+		html.Render(&out, child)
+	}
+	return out.String()
+}
+
+// NodeNavigator is a cursor over an html.Node tree. The package asserts that
+// *NodeNavigator satisfies the xpath.NodeNavigator interface.
+type NodeNavigator struct {
+	// root is the node the navigator was created on; MoveToRoot returns to it.
+	root *html.Node
+	// curr is the node the navigator is currently positioned on.
+	curr *html.Node
+	// attr is the index into curr.Attr of the selected attribute, or -1 when
+	// the navigator is positioned on the node itself.
+	attr int
+}
+
+// Current returns the html.Node the navigator is currently positioned on.
+// When the navigator is on an attribute, this is the node whose Attr slice
+// the attribute index refers to.
+func (h *NodeNavigator) Current() *html.Node {
+	return h.curr
+}
+
+// NodeType reports the xpath node type of the current position. Document and
+// doctype nodes are both reported as the root node; an element is reported as
+// an attribute node while an attribute index is selected. It panics for any
+// other html node type.
+func (h *NodeNavigator) NodeType() xpath.NodeType {
+	switch kind := h.curr.Type; kind {
+	case html.DocumentNode, html.DoctypeNode:
+		// The <!DOCTYPE HTML> declaration is ignored and treated as root.
+		return xpath.RootNode
+	case html.ElementNode:
+		if h.attr == -1 {
+			return xpath.ElementNode
+		}
+		return xpath.AttributeNode
+	case html.TextNode:
+		return xpath.TextNode
+	case html.CommentNode:
+		return xpath.CommentNode
+	default:
+		panic(fmt.Sprintf("unknown HTML node type: %v", kind))
+	}
+}
+
+// LocalName returns the key of the selected attribute when the navigator is
+// on an attribute, and the current node's Data otherwise.
+func (h *NodeNavigator) LocalName() string {
+	if h.attr == -1 {
+		return h.curr.Data
+	}
+	return h.curr.Attr[h.attr].Key
+}
+
+// Prefix always returns the empty string.
+func (*NodeNavigator) Prefix() string {
+	return ""
+}
+
+// Value returns the string value of the current position: the Data of a
+// comment or text node, the value of the selected attribute, or the inner
+// text of an element. Every other node type yields the empty string.
+func (h *NodeNavigator) Value() string {
+	node := h.curr
+	switch node.Type {
+	case html.CommentNode, html.TextNode:
+		return node.Data
+	case html.ElementNode:
+		if h.attr == -1 {
+			return InnerText(node)
+		}
+		return node.Attr[h.attr].Val
+	default:
+		return ""
+	}
+}
+
+// Copy returns a new navigator with the same root, current node and attribute
+// index as h. The underlying html nodes are shared, not duplicated.
+func (h *NodeNavigator) Copy() xpath.NodeNavigator {
+	clone := new(NodeNavigator)
+	*clone = *h
+	return clone
+}
+
+// MoveToRoot repositions the navigator on the root node it was created with.
+//
+// Any attribute selection is cleared as well: leaving attr set would keep an
+// index that belongs to the previous node's Attr slice, which LocalName and
+// Value would then apply to the root node.
+func (h *NodeNavigator) MoveToRoot() {
+	h.curr = h.root
+	h.attr = -1
+}
+
+// MoveToParent moves up one level and reports whether it moved. From an
+// attribute position it clears the attribute selection and stays on the same
+// node; otherwise it moves to the current node's Parent, and returns false
+// when there is none.
+func (h *NodeNavigator) MoveToParent() bool {
+	if h.attr != -1 {
+		h.attr = -1
+		return true
+	}
+	parent := h.curr.Parent
+	if parent == nil {
+		return false
+	}
+	h.curr = parent
+	return true
+}
+
+// MoveToNextAttribute advances the attribute index by one and reports whether
+// it moved. Starting from -1 this selects the first attribute; it returns
+// false, leaving the index unchanged, when there is no further attribute on
+// the current node.
+func (h *NodeNavigator) MoveToNextAttribute() bool {
+	next := h.attr + 1
+	if next >= len(h.curr.Attr) {
+		return false
+	}
+	h.attr = next
+	return true
+}
+
+// MoveToChild moves to the first child of the current node and reports
+// whether it moved. It returns false when the navigator is on an attribute or
+// the node has no children.
+func (h *NodeNavigator) MoveToChild() bool {
+	if h.attr != -1 || h.curr.FirstChild == nil {
+		return false
+	}
+	h.curr = h.curr.FirstChild
+	return true
+}
+
+// MoveToFirst moves to the first sibling of the current node by following
+// PrevSibling links and reports whether it moved. It returns false when the
+// navigator is on an attribute or is already on the first sibling.
+func (h *NodeNavigator) MoveToFirst() bool {
+	if h.attr != -1 || h.curr.PrevSibling == nil {
+		return false
+	}
+	for prev := h.curr.PrevSibling; prev != nil; prev = h.curr.PrevSibling {
+		h.curr = prev
+	}
+	return true
+}
+
+// String returns the same text as Value.
+func (h *NodeNavigator) String() string {
+	return h.Value()
+}
+
+// MoveToNext moves to the next sibling of the current node and reports
+// whether it moved. It returns false when the navigator is on an attribute or
+// there is no next sibling.
+func (h *NodeNavigator) MoveToNext() bool {
+	if h.attr != -1 || h.curr.NextSibling == nil {
+		return false
+	}
+	h.curr = h.curr.NextSibling
+	return true
+}
+
+// MoveToPrevious moves to the previous sibling of the current node and
+// reports whether it moved. It returns false when the navigator is on an
+// attribute or there is no previous sibling.
+func (h *NodeNavigator) MoveToPrevious() bool {
+	if h.attr != -1 || h.curr.PrevSibling == nil {
+		return false
+	}
+	h.curr = h.curr.PrevSibling
+	return true
+}
+
+// MoveTo repositions h at the same node and attribute index as other and
+// reports whether it did so. It returns false, leaving h unchanged, when
+// other is not a *NodeNavigator or was created on a different root.
+func (h *NodeNavigator) MoveTo(other xpath.NodeNavigator) bool {
+	if src, ok := other.(*NodeNavigator); ok && src.root == h.root {
+		h.curr, h.attr = src.curr, src.attr
+		return true
+	}
+	return false
+}