Mirror of https://github.com/stashapp/stash.git
Xpath scraping from URL (#285)

* Add xpath performer and scene scraping
* Add studio scraping
* Refactor code
* Fix compile error
* Don't overwrite performer URL during a scrape
vendor/github.com/antchfx/htmlquery/.gitignore (new file, 32 lines, generated, vendored)
@@ -0,0 +1,32 @@
# vscode
.vscode
debug
*.test

./build

# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so


# Folders
_obj
_test

# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out

*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*

_testmain.go

*.exe
*.test
*.prof
vendor/github.com/antchfx/htmlquery/.travis.yml (new file, 16 lines, generated, vendored)
@@ -0,0 +1,16 @@
language: go

go:
- 1.6
- 1.7
- 1.8

install:
- go get golang.org/x/net/html/charset
- go get golang.org/x/net/html
- go get github.com/antchfx/xpath
- go get github.com/mattn/goveralls
- go get github.com/golang/groupcache

script:
- $HOME/gopath/bin/goveralls -service=travis-ci
vendor/github.com/antchfx/htmlquery/LICENSE (new file, 17 lines, generated, vendored)
@@ -0,0 +1,17 @@
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
vendor/github.com/antchfx/htmlquery/README.md (new file, 158 lines, generated, vendored)
@@ -0,0 +1,158 @@
htmlquery
====
[Build Status](https://travis-ci.org/antchfx/htmlquery)
[Coverage Status](https://coveralls.io/github/antchfx/htmlquery?branch=master)
[GoDoc](https://godoc.org/github.com/antchfx/htmlquery)
[Go Report Card](https://goreportcard.com/report/github.com/antchfx/htmlquery)

Overview
====

`htmlquery` is an XPath query package for HTML; it lets you extract data from, or evaluate expressions against, HTML documents using XPath expressions.

`htmlquery` has a built-in query object caching feature based on [LRU](https://godoc.org/github.com/golang/groupcache/lru) that caches recently used XPath query strings. Enabling caching avoids re-compiling the XPath expression on each query.
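
For reference, the cache is controlled through two package-level variables declared in `cache.go` (included later in this commit); a minimal sketch with illustrative values:

```go
// Sketch only: the values below are illustrative, not recommendations.
// Per the doc comments in cache.go, caching is skipped entirely when
// DisableSelectorCache is true or SelectorCacheMaxEntries <= 0.
htmlquery.DisableSelectorCache = false
htmlquery.SelectorCacheMaxEntries = 50 // package default
```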

Installation
====

```
go get github.com/antchfx/htmlquery
```

Getting Started
====

#### Query: returns matched elements or an error.

```go
nodes, err := htmlquery.QueryAll(doc, "//a")
if err != nil {
    panic(`not a valid XPath expression.`)
}
```

#### Load HTML document from URL.

```go
doc, err := htmlquery.LoadURL("http://example.com/")
```

#### Load HTML document from a file.

```go
filePath := "/home/user/sample.html"
doc, err := htmlquery.LoadDoc(filePath)
```

#### Load HTML document from a string.

```go
s := `<html>....</html>`
doc, err := htmlquery.Parse(strings.NewReader(s))
```

#### Find all A elements.

```go
list := htmlquery.Find(doc, "//a")
```

#### Find all A elements that have an `href` attribute.

```go
list := htmlquery.Find(doc, "//a[@href]")
```

#### Find all A elements with an `href` attribute and return only the `href` value.

```go
list := htmlquery.Find(doc, "//a/@href")
for _, n := range list {
    fmt.Println(htmlquery.InnerText(n)) // outputs the @href value without the A element.
}
```

#### Find the third A element.

```go
a := htmlquery.FindOne(doc, "//a[3]")
```

#### Evaluate the number of all IMG elements.

```go
expr, _ := xpath.Compile("count(//img)")
v := expr.Evaluate(htmlquery.CreateXPathNavigator(doc)).(float64)
fmt.Printf("total count is %f", v)
```

FAQ
====

#### `Find()` vs `QueryAll()`, which is better?

`Find` and `QueryAll` do the same thing: they search for all matching HTML nodes. `Find` panics if you pass an invalid XPath query, while `QueryAll` returns an error instead.
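
As a minimal sketch of the difference (the malformed expression is purely illustrative):

```go
nodes, err := htmlquery.QueryAll(doc, "//a[") // invalid XPath: returns an error
if err != nil {
    fmt.Println(err)
}
_ = nodes

htmlquery.Find(doc, "//a[") // the same invalid XPath makes Find panic
```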

#### Can I save my query expression object for the next query?

Yes, you can. The `QuerySelector` and `QuerySelectorAll` functions accept a compiled query expression object.

Caching (or reusing) a query expression object avoids re-compiling the XPath expression and improves query performance.
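
For example, a minimal sketch of compiling an expression once and reusing it across several documents (`docs` is a hypothetical slice of parsed `*html.Node` values):

```go
expr, err := xpath.Compile("//a[@href]")
if err != nil {
    panic(err)
}
for _, doc := range docs {
    // Reuse the compiled expression; no re-compilation per document.
    links := htmlquery.QuerySelectorAll(doc, expr)
    fmt.Println(len(links))
}
```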

#### Disable the caching feature

```
htmlquery.DisableSelectorCache = true
```

Changelogs
===

2019-11-19
- Added a built-in query object cache to avoid re-compilation of the same query string. [#16](https://github.com/antchfx/htmlquery/issues/16)
- Added `LoadDoc`. [#18](https://github.com/antchfx/htmlquery/pull/18)

2019-10-05
- Added new methods that return an error for invalid XPath expressions: `QueryAll` and `Query`.
- Added `QuerySelector` and `QuerySelectorAll` methods, which support reusing your query object.

2019-02-04
- [#7](https://github.com/antchfx/htmlquery/issues/7) Removed deprecated `FindEach()` and `FindEachWithBreak()` methods.

2018-12-28
- Avoid adding duplicate elements to the list returned by `Find()`. [#6](https://github.com/antchfx/htmlquery/issues/6)

Tutorial
===

```go
func main() {
    doc, err := htmlquery.LoadURL("https://www.bing.com/search?q=golang")
    if err != nil {
        panic(err)
    }
    // Find all news items.
    list, err := htmlquery.QueryAll(doc, "//ol/li")
    if err != nil {
        panic(err)
    }
    for i, n := range list {
        a := htmlquery.FindOne(n, "//a")
        fmt.Printf("%d %s(%s)\n", i, htmlquery.InnerText(a), htmlquery.SelectAttr(a, "href"))
    }
}
```

List of supported XPath query packages
===

| Name                                              | Description                                |
| ------------------------------------------------- | ------------------------------------------ |
| [htmlquery](https://github.com/antchfx/htmlquery) | XPath query package for the HTML document  |
| [xmlquery](https://github.com/antchfx/xmlquery)   | XPath query package for the XML document   |
| [jsonquery](https://github.com/antchfx/jsonquery) | XPath query package for the JSON document  |

Questions
===
Please let me know if you have any questions.
vendor/github.com/antchfx/htmlquery/cache.go (new file, 40 lines, generated, vendored)
@@ -0,0 +1,40 @@
package htmlquery

import (
    "sync"

    "github.com/golang/groupcache/lru"

    "github.com/antchfx/xpath"
)

// DisableSelectorCache will disable caching for the query selector if value is true.
var DisableSelectorCache = false

// SelectorCacheMaxEntries allows how many selector object can be caching. Default is 50.
// Will disable caching if SelectorCacheMaxEntries <= 0.
var SelectorCacheMaxEntries = 50

var (
    cacheOnce sync.Once
    cache     *lru.Cache
)

func getQuery(expr string) (*xpath.Expr, error) {
    if DisableSelectorCache || SelectorCacheMaxEntries <= 0 {
        return xpath.Compile(expr)
    }
    cacheOnce.Do(func() {
        cache = lru.New(50)
    })
    if v, ok := cache.Get(expr); ok {
        return v.(*xpath.Expr), nil
    }
    v, err := xpath.Compile(expr)
    if err != nil {
        return nil, err
    }
    cache.Add(expr, v)
    return v, nil

}
vendor/github.com/antchfx/htmlquery/query.go (new file, 338 lines, generated, vendored)
@@ -0,0 +1,338 @@
/*
Package htmlquery provides extract data from HTML documents using XPath expression.
*/
package htmlquery

import (
    "bufio"
    "bytes"
    "fmt"
    "io"
    "net/http"
    "os"

    "github.com/antchfx/xpath"
    "golang.org/x/net/html"
    "golang.org/x/net/html/charset"
)

var _ xpath.NodeNavigator = &NodeNavigator{}

// CreateXPathNavigator creates a new xpath.NodeNavigator for the specified html.Node.
func CreateXPathNavigator(top *html.Node) *NodeNavigator {
    return &NodeNavigator{curr: top, root: top, attr: -1}
}

// Find is like QueryAll but Will panics if the expression `expr` cannot be parsed.
//
// See `QueryAll()` function.
func Find(top *html.Node, expr string) []*html.Node {
    nodes, err := QueryAll(top, expr)
    if err != nil {
        panic(err)
    }
    return nodes
}

// FindOne is like Query but will panics if the expression `expr` cannot be parsed.
// See `Query()` function.
func FindOne(top *html.Node, expr string) *html.Node {
    node, err := Query(top, expr)
    if err != nil {
        panic(err)
    }
    return node
}

// QueryAll searches the html.Node that matches by the specified XPath expr.
// Return an error if the expression `expr` cannot be parsed.
func QueryAll(top *html.Node, expr string) ([]*html.Node, error) {
    exp, err := getQuery(expr)
    if err != nil {
        return nil, err
    }
    nodes := QuerySelectorAll(top, exp)
    return nodes, nil
}

// Query searches the html.Node that matches by the specified XPath expr,
// and return the first element of matched html.Node.
//
// Return an error if the expression `expr` cannot be parsed.
func Query(top *html.Node, expr string) (*html.Node, error) {
    exp, err := getQuery(expr)
    if err != nil {
        return nil, err
    }
    return QuerySelector(top, exp), nil
}

// QuerySelector returns the first matched html.Node by the specified XPath selector.
func QuerySelector(top *html.Node, selector *xpath.Expr) *html.Node {
    t := selector.Select(CreateXPathNavigator(top))
    if t.MoveNext() {
        return getCurrentNode(t.Current().(*NodeNavigator))
    }
    return nil
}

// QuerySelectorAll searches all of the html.Node that matches the specified XPath selectors.
func QuerySelectorAll(top *html.Node, selector *xpath.Expr) []*html.Node {
    var elems []*html.Node
    t := selector.Select(CreateXPathNavigator(top))
    for t.MoveNext() {
        nav := t.Current().(*NodeNavigator)
        n := getCurrentNode(nav)
        // avoid adding duplicate nodes.
        if len(elems) > 0 && (elems[0] == n || (nav.NodeType() == xpath.AttributeNode &&
            nav.LocalName() == elems[0].Data && nav.Value() == InnerText(elems[0]))) {
            continue
        }
        elems = append(elems, n)
    }
    return elems
}

// LoadURL loads the HTML document from the specified URL.
func LoadURL(url string) (*html.Node, error) {
    resp, err := http.Get(url)
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()

    r, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type"))
    if err != nil {
        return nil, err
    }
    return html.Parse(r)
}

// LoadDoc loads the HTML document from the specified file path.
func LoadDoc(path string) (*html.Node, error) {
    f, err := os.Open(path)
    if err != nil {
        return nil, err
    }
    defer f.Close()

    return html.Parse(bufio.NewReader(f))
}

func getCurrentNode(n *NodeNavigator) *html.Node {
    if n.NodeType() == xpath.AttributeNode {
        childNode := &html.Node{
            Type: html.TextNode,
            Data: n.Value(),
        }
        return &html.Node{
            Type:       html.ElementNode,
            Data:       n.LocalName(),
            FirstChild: childNode,
            LastChild:  childNode,
        }

    }
    return n.curr
}

// Parse returns the parse tree for the HTML from the given Reader.
func Parse(r io.Reader) (*html.Node, error) {
    return html.Parse(r)
}

// InnerText returns the text between the start and end tags of the object.
func InnerText(n *html.Node) string {
    var output func(*bytes.Buffer, *html.Node)
    output = func(buf *bytes.Buffer, n *html.Node) {
        switch n.Type {
        case html.TextNode:
            buf.WriteString(n.Data)
            return
        case html.CommentNode:
            return
        }
        for child := n.FirstChild; child != nil; child = child.NextSibling {
            output(buf, child)
        }
    }

    var buf bytes.Buffer
    output(&buf, n)
    return buf.String()
}

// SelectAttr returns the attribute value with the specified name.
func SelectAttr(n *html.Node, name string) (val string) {
    if n == nil {
        return
    }
    if n.Type == html.ElementNode && n.Parent == nil && name == n.Data {
        return InnerText(n)
    }
    for _, attr := range n.Attr {
        if attr.Key == name {
            val = attr.Val
            break
        }
    }
    return
}

// OutputHTML returns the text including tags name.
func OutputHTML(n *html.Node, self bool) string {
    var buf bytes.Buffer
    if self {
        html.Render(&buf, n)
    } else {
        for n := n.FirstChild; n != nil; n = n.NextSibling {
            html.Render(&buf, n)
        }
    }
    return buf.String()
}

type NodeNavigator struct {
    root, curr *html.Node
    attr       int
}

func (h *NodeNavigator) Current() *html.Node {
    return h.curr
}

func (h *NodeNavigator) NodeType() xpath.NodeType {
    switch h.curr.Type {
    case html.CommentNode:
        return xpath.CommentNode
    case html.TextNode:
        return xpath.TextNode
    case html.DocumentNode:
        return xpath.RootNode
    case html.ElementNode:
        if h.attr != -1 {
            return xpath.AttributeNode
        }
        return xpath.ElementNode
    case html.DoctypeNode:
        // ignored <!DOCTYPE HTML> declare and as Root-Node type.
        return xpath.RootNode
    }
    panic(fmt.Sprintf("unknown HTML node type: %v", h.curr.Type))
}

func (h *NodeNavigator) LocalName() string {
    if h.attr != -1 {
        return h.curr.Attr[h.attr].Key
    }
    return h.curr.Data
}

func (*NodeNavigator) Prefix() string {
    return ""
}

func (h *NodeNavigator) Value() string {
    switch h.curr.Type {
    case html.CommentNode:
        return h.curr.Data
    case html.ElementNode:
        if h.attr != -1 {
            return h.curr.Attr[h.attr].Val
        }
        return InnerText(h.curr)
    case html.TextNode:
        return h.curr.Data
    }
    return ""
}

func (h *NodeNavigator) Copy() xpath.NodeNavigator {
    n := *h
    return &n
}

func (h *NodeNavigator) MoveToRoot() {
    h.curr = h.root
}

func (h *NodeNavigator) MoveToParent() bool {
    if h.attr != -1 {
        h.attr = -1
        return true
    } else if node := h.curr.Parent; node != nil {
        h.curr = node
        return true
    }
    return false
}

func (h *NodeNavigator) MoveToNextAttribute() bool {
    if h.attr >= len(h.curr.Attr)-1 {
        return false
    }
    h.attr++
    return true
}

func (h *NodeNavigator) MoveToChild() bool {
    if h.attr != -1 {
        return false
    }
    if node := h.curr.FirstChild; node != nil {
        h.curr = node
        return true
    }
    return false
}

func (h *NodeNavigator) MoveToFirst() bool {
    if h.attr != -1 || h.curr.PrevSibling == nil {
        return false
    }
    for {
        node := h.curr.PrevSibling
        if node == nil {
            break
        }
        h.curr = node
    }
    return true
}

func (h *NodeNavigator) String() string {
    return h.Value()
}

func (h *NodeNavigator) MoveToNext() bool {
    if h.attr != -1 {
        return false
    }
    if node := h.curr.NextSibling; node != nil {
        h.curr = node
        return true
    }
    return false
}

func (h *NodeNavigator) MoveToPrevious() bool {
    if h.attr != -1 {
        return false
    }
    if node := h.curr.PrevSibling; node != nil {
        h.curr = node
        return true
    }
    return false
}

func (h *NodeNavigator) MoveTo(other xpath.NodeNavigator) bool {
    node, ok := other.(*NodeNavigator)
    if !ok || node.root != h.root {
        return false
    }

    h.curr = node.curr
    h.attr = node.attr
    return true
}