Update xpath dependency (#507)

This commit is contained in:
WithoutPants
2020-04-30 08:32:33 +10:00
committed by GitHub
parent 3d22d5a742
commit 2166caf322
288 changed files with 34566 additions and 82048 deletions

View File

@@ -1,9 +1,9 @@
language: go
go:
- 1.6
- 1.7
- 1.8
- 1.9.x
- 1.12.x
- 1.13.x
install:
- go get golang.org/x/net/html/charset

View File

@@ -10,7 +10,7 @@ Overview
`htmlquery` is an XPath query package for HTML that lets you extract data from, or evaluate, HTML documents using an XPath expression.
`htmlquery` build-in the query object caching feature based on [LRU](https://godoc.org/github.com/golang/groupcache/lru), this feature will caching the recently used XPATH query string. enable caching can avoid re-compile XPath expression each query.
`htmlquery` has a built-in query object caching feature based on [LRU](https://godoc.org/github.com/golang/groupcache/lru); this feature caches the recently used XPath query strings. Enabling query caching avoids re-compiling the XPath expression on each query.
Installation
====
@@ -101,7 +101,17 @@ Yes, you can. We offer the `QuerySelector` and `QuerySelectorAll` methods, It wi
Caching (or reusing) a query expression object avoids re-compiling the XPath query expression, which improves query performance.
#### Disable caching feature
#### XPath query object cache performance
```
goos: windows
goarch: amd64
pkg: github.com/antchfx/htmlquery
BenchmarkSelectorCache-4 20000000 55.2 ns/op
BenchmarkDisableSelectorCache-4 500000 3162 ns/op
```
#### How to disable caching?
```
htmlquery.DisableSelectorCache = true

View File

@@ -3,9 +3,8 @@ package htmlquery
import (
"sync"
"github.com/golang/groupcache/lru"
"github.com/antchfx/xpath"
"github.com/golang/groupcache/lru"
)
// DisableSelectorCache will disable caching for the query selector if value is true.
@@ -16,8 +15,9 @@ var DisableSelectorCache = false
var SelectorCacheMaxEntries = 50
var (
cacheOnce sync.Once
cache *lru.Cache
cacheOnce sync.Once
cache *lru.Cache
cacheMutex sync.Mutex
)
func getQuery(expr string) (*xpath.Expr, error) {
@@ -25,8 +25,10 @@ func getQuery(expr string) (*xpath.Expr, error) {
return xpath.Compile(expr)
}
cacheOnce.Do(func() {
cache = lru.New(50)
cache = lru.New(SelectorCacheMaxEntries)
})
cacheMutex.Lock()
defer cacheMutex.Unlock()
if v, ok := cache.Get(expr); ok {
return v.(*xpath.Expr), nil
}

9
vendor/github.com/antchfx/htmlquery/go.mod generated vendored Normal file
View File

@@ -0,0 +1,9 @@
module github.com/antchfx/htmlquery
go 1.14
require (
github.com/antchfx/xpath v1.1.6
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e
golang.org/x/net v0.0.0-20200421231249-e086a090c8fd
)

11
vendor/github.com/antchfx/htmlquery/go.sum generated vendored Normal file
View File

@@ -0,0 +1,11 @@
github.com/antchfx/xpath v1.1.6 h1:6sVh6hB5T6phw1pFpHRQ+C4bd8sNI+O58flqtg7h0R0=
github.com/antchfx/xpath v1.1.6/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e h1:1r7pUrabqp18hOBcwBwiTsbnFeTZHV9eER/QT5JVZxY=
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/net v0.0.0-20200421231249-e086a090c8fd h1:QPwSajcTUrFriMF1nJ3XzgoqakqQEsnZf9LdXdi2nkI=
golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=

View File

@@ -144,6 +144,8 @@ Supported Features
`not()`| ✓ |
`number()`| ✓ |
`position()`| ✓ |
`replace()`| ✓ |
`reverse()`| ✓ |
`round()`| ✓ |
`starts-with()`| ✓ |
`string()`| ✓ |

View File

@@ -77,7 +77,18 @@ func (b *builder) processAxisNode(root *axisNode) (query, error) {
} else {
qyGrandInput = &contextQuery{}
}
qyOutput = &descendantQuery{Input: qyGrandInput, Predicate: predicate, Self: true}
// fix #20: https://github.com/antchfx/htmlquery/issues/20
filter := func(n NodeNavigator) bool {
v := predicate(n)
switch root.Prop {
case "text":
v = v && n.NodeType() == TextNode
case "comment":
v = v && n.NodeType() == CommentNode
}
return v
}
qyOutput = &descendantQuery{Input: qyGrandInput, Predicate: filter, Self: true}
return qyOutput, nil
}
}
@@ -243,6 +254,25 @@ func (b *builder) processFunctionNode(root *functionNode) (query, error) {
return nil, err
}
qyOutput = &functionQuery{Input: argQuery, Func: normalizespaceFunc}
case "replace":
//replace( string , string, string )
if len(root.Args) != 3 {
return nil, errors.New("xpath: replace function must have three parameters")
}
var (
arg1, arg2, arg3 query
err error
)
if arg1, err = b.processNode(root.Args[0]); err != nil {
return nil, err
}
if arg2, err = b.processNode(root.Args[1]); err != nil {
return nil, err
}
if arg3, err = b.processNode(root.Args[2]); err != nil {
return nil, err
}
qyOutput = &functionQuery{Input: b.firstInput, Func: replaceFunc(arg1, arg2, arg3)}
case "translate":
//translate( string , string, string )
if len(root.Args) != 3 {
@@ -272,27 +302,27 @@ func (b *builder) processFunctionNode(root *functionNode) (query, error) {
}
qyOutput = &functionQuery{Input: argQuery, Func: notFunc}
case "name", "local-name", "namespace-uri":
inp := b.firstInput
if len(root.Args) > 1 {
return nil, fmt.Errorf("xpath: %s function must have at most one parameter", root.FuncName)
}
var (
arg query
err error
)
if len(root.Args) == 1 {
argQuery, err := b.processNode(root.Args[0])
arg, err = b.processNode(root.Args[0])
if err != nil {
return nil, err
}
inp = argQuery
}
f := &functionQuery{Input: inp}
switch root.FuncName {
case "name":
f.Func = nameFunc
qyOutput = &functionQuery{Input: b.firstInput, Func: nameFunc(arg)}
case "local-name":
f.Func = localNameFunc
qyOutput = &functionQuery{Input: b.firstInput, Func: localNameFunc(arg)}
case "namespace-uri":
f.Func = namespaceFunc
qyOutput = &functionQuery{Input: b.firstInput, Func: namespaceFunc(arg)}
}
qyOutput = f
case "true", "false":
val := root.FuncName == "true"
qyOutput = &functionQuery{
@@ -379,6 +409,15 @@ func (b *builder) processFunctionNode(root *functionNode) (query, error) {
args = append(args, q)
}
qyOutput = &functionQuery{Input: b.firstInput, Func: concatFunc(args...)}
case "reverse":
if len(root.Args) == 0 {
return nil, fmt.Errorf("xpath: reverse(node-sets) function must with have parameters node-sets")
}
argQuery, err := b.processNode(root.Args[0])
if err != nil {
return nil, err
}
qyOutput = &transformFunctionQuery{Input: argQuery, Func: reverseFunc}
default:
return nil, fmt.Errorf("not yet support this function %s()", root.FuncName)
}

View File

@@ -134,42 +134,64 @@ func roundFunc(q query, t iterator) interface{} {
}
// nameFunc is a XPath functions name([node-set]).
func nameFunc(q query, t iterator) interface{} {
v := q.Select(t)
if v == nil {
return ""
func nameFunc(arg query) func(query, iterator) interface{} {
return func(q query, t iterator) interface{} {
var v NodeNavigator
if arg == nil {
v = t.Current()
} else {
v = arg.Select(t)
if v == nil {
return ""
}
}
ns := v.Prefix()
if ns == "" {
return v.LocalName()
}
return ns + ":" + v.LocalName()
}
ns := v.Prefix()
if ns == "" {
return v.LocalName()
}
return ns + ":" + v.LocalName()
}
// localNameFunc is a XPath functions local-name([node-set]).
func localNameFunc(q query, t iterator) interface{} {
v := q.Select(t)
if v == nil {
return ""
func localNameFunc(arg query) func(query, iterator) interface{} {
return func(q query, t iterator) interface{} {
var v NodeNavigator
if arg == nil {
v = t.Current()
} else {
v = arg.Select(t)
if v == nil {
return ""
}
}
return v.LocalName()
}
return v.LocalName()
}
// namespaceFunc is a XPath functions namespace-uri([node-set]).
func namespaceFunc(q query, t iterator) interface{} {
v := q.Select(t)
if v == nil {
return ""
func namespaceFunc(arg query) func(query, iterator) interface{} {
return func(q query, t iterator) interface{} {
var v NodeNavigator
if arg == nil {
v = t.Current()
} else {
// Get the first node in the node-set if specified.
v = arg.Select(t)
if v == nil {
return ""
}
}
// fix about namespace-uri() bug: https://github.com/antchfx/xmlquery/issues/22
// TODO: In the next version, add NamespaceURL() to the NodeNavigator interface.
type namespaceURL interface {
NamespaceURL() string
}
if f, ok := v.(namespaceURL); ok {
return f.NamespaceURL()
}
return v.Prefix()
}
// fix about namespace-uri() bug: https://github.com/antchfx/xmlquery/issues/22
// TODO: In the next version, add NamespaceURL() to the NodeNavigator interface.
type namespaceURL interface {
NamespaceURL() string
}
if f, ok := v.(namespaceURL); ok {
return f.NamespaceURL()
}
return v.Prefix()
}
func asBool(t iterator, v interface{}) bool {
@@ -240,7 +262,7 @@ func startwithFunc(arg1, arg2 query) func(query, iterator) interface{} {
m, n string
ok bool
)
switch typ := arg1.Evaluate(t).(type) {
switch typ := functionArgs(arg1).Evaluate(t).(type) {
case string:
m = typ
case query:
@@ -252,7 +274,7 @@ func startwithFunc(arg1, arg2 query) func(query, iterator) interface{} {
default:
panic(errors.New("starts-with() function argument type must be string"))
}
n, ok = arg2.Evaluate(t).(string)
n, ok = functionArgs(arg2).Evaluate(t).(string)
if !ok {
panic(errors.New("starts-with() function argument type must be string"))
}
@@ -267,7 +289,7 @@ func endwithFunc(arg1, arg2 query) func(query, iterator) interface{} {
m, n string
ok bool
)
switch typ := arg1.Evaluate(t).(type) {
switch typ := functionArgs(arg1).Evaluate(t).(type) {
case string:
m = typ
case query:
@@ -279,7 +301,7 @@ func endwithFunc(arg1, arg2 query) func(query, iterator) interface{} {
default:
panic(errors.New("ends-with() function argument type must be string"))
}
n, ok = arg2.Evaluate(t).(string)
n, ok = functionArgs(arg2).Evaluate(t).(string)
if !ok {
panic(errors.New("ends-with() function argument type must be string"))
}
@@ -294,8 +316,7 @@ func containsFunc(arg1, arg2 query) func(query, iterator) interface{} {
m, n string
ok bool
)
switch typ := arg1.Evaluate(t).(type) {
switch typ := functionArgs(arg1).Evaluate(t).(type) {
case string:
m = typ
case query:
@@ -308,7 +329,7 @@ func containsFunc(arg1, arg2 query) func(query, iterator) interface{} {
panic(errors.New("contains() function argument type must be string"))
}
n, ok = arg2.Evaluate(t).(string)
n, ok = functionArgs(arg2).Evaluate(t).(string)
if !ok {
panic(errors.New("contains() function argument type must be string"))
}
@@ -345,7 +366,7 @@ func normalizespaceFunc(q query, t iterator) interface{} {
func substringFunc(arg1, arg2, arg3 query) func(query, iterator) interface{} {
return func(q query, t iterator) interface{} {
var m string
switch typ := arg1.Evaluate(t).(type) {
switch typ := functionArgs(arg1).Evaluate(t).(type) {
case string:
m = typ
case query:
@@ -359,14 +380,14 @@ func substringFunc(arg1, arg2, arg3 query) func(query, iterator) interface{} {
var start, length float64
var ok bool
if start, ok = arg2.Evaluate(t).(float64); !ok {
if start, ok = functionArgs(arg2).Evaluate(t).(float64); !ok {
panic(errors.New("substring() function first argument type must be int"))
} else if start < 1 {
panic(errors.New("substring() function first argument type must be >= 1"))
}
start--
if arg3 != nil {
if length, ok = arg3.Evaluate(t).(float64); !ok {
if length, ok = functionArgs(arg3).Evaluate(t).(float64); !ok {
panic(errors.New("substring() function second argument type must be int"))
}
}
@@ -384,7 +405,7 @@ func substringFunc(arg1, arg2, arg3 query) func(query, iterator) interface{} {
func substringIndFunc(arg1, arg2 query, after bool) func(query, iterator) interface{} {
return func(q query, t iterator) interface{} {
var str string
switch v := arg1.Evaluate(t).(type) {
switch v := functionArgs(arg1).Evaluate(t).(type) {
case string:
str = v
case query:
@@ -395,7 +416,7 @@ func substringIndFunc(arg1, arg2 query, after bool) func(query, iterator) interf
str = node.Value()
}
var word string
switch v := arg2.Evaluate(t).(type) {
switch v := functionArgs(arg2).Evaluate(t).(type) {
case string:
word = v
case query:
@@ -424,7 +445,7 @@ func substringIndFunc(arg1, arg2 query, after bool) func(query, iterator) interf
// equal to the number of characters in a given string.
func stringLengthFunc(arg1 query) func(query, iterator) interface{} {
return func(q query, t iterator) interface{} {
switch v := arg1.Evaluate(t).(type) {
switch v := functionArgs(arg1).Evaluate(t).(type) {
case string:
return float64(len(v))
case query:
@@ -441,9 +462,9 @@ func stringLengthFunc(arg1 query) func(query, iterator) interface{} {
// translateFunc is XPath functions translate() function returns a replaced string.
func translateFunc(arg1, arg2, arg3 query) func(query, iterator) interface{} {
return func(q query, t iterator) interface{} {
str := asString(t, arg1.Evaluate(t))
src := asString(t, arg2.Evaluate(t))
dst := asString(t, arg3.Evaluate(t))
str := asString(t, functionArgs(arg1).Evaluate(t))
src := asString(t, functionArgs(arg2).Evaluate(t))
dst := asString(t, functionArgs(arg3).Evaluate(t))
var replace []string
for i, s := range src {
@@ -457,6 +478,17 @@ func translateFunc(arg1, arg2, arg3 query) func(query, iterator) interface{} {
}
}
// replaceFunc implements the XPath replace(string, string, string) function.
//
// The returned closure evaluates the three argument queries against the
// iterator (each one first passed through functionArgs), converts every
// result with asString, and returns the first string with all occurrences of
// the second replaced by the third. Matching is a literal substring match,
// not a regular expression.
func replaceFunc(arg1, arg2, arg3 query) func(query, iterator) interface{} {
	return func(_ query, t iterator) interface{} {
		// Arguments are evaluated strictly in order: input, old, replacement.
		input := asString(t, functionArgs(arg1).Evaluate(t))
		old := asString(t, functionArgs(arg2).Evaluate(t))
		replacement := asString(t, functionArgs(arg3).Evaluate(t))
		return strings.ReplaceAll(input, old, replacement)
	}
}
// notFunc is XPATH functions not(expression) function operation.
func notFunc(q query, t iterator) interface{} {
switch v := q.Evaluate(t).(type) {
@@ -477,6 +509,7 @@ func concatFunc(args ...query) func(query, iterator) interface{} {
return func(q query, t iterator) interface{} {
var a []string
for _, v := range args {
v = functionArgs(v)
switch v := v.Evaluate(t).(type) {
case string:
a = append(a, v)
@@ -490,3 +523,31 @@ func concatFunc(args ...query) func(query, iterator) interface{} {
return strings.Join(a, "")
}
}
// functionArgs prepares a query for use as an XPath function argument.
//
// A *functionQuery is handed back unchanged; any other kind of query is
// cloned, so the caller evaluates a fresh copy instead of q itself.
// See https://github.com/antchfx/xpath/issues/43.
func functionArgs(q query) query {
	switch q.(type) {
	case *functionQuery:
		return q
	default:
		return q.Clone()
	}
}
// reverseFunc backs the XPath reverse() function.
//
// It drains q by calling Select until it yields nil, keeping a copy of every
// node, and returns an iterator that hands those copies back last-to-first.
// Once all nodes have been returned the iterator keeps returning nil.
func reverseFunc(q query, t iterator) func() NodeNavigator {
	var nodes []NodeNavigator
	for n := q.Select(t); n != nil; n = q.Select(t) {
		// Copy so later navigation by q cannot move an already collected node.
		nodes = append(nodes, n.Copy())
	}

	remaining := len(nodes)
	return func() NodeNavigator {
		if remaining <= 0 {
			return nil
		}
		remaining--
		return nodes[remaining]
	}
}

View File

@@ -163,7 +163,17 @@ func cmpNodeSetString(t iterator, op string, m, n interface{}) bool {
}
func cmpNodeSetNodeSet(t iterator, op string, m, n interface{}) bool {
return false
a := m.(query)
b := n.(query)
x := a.Select(t)
if x == nil {
return false
}
y := b.Select(t)
if y == nil {
return false
}
return cmpStringStringF(op,x.Value(),y.Value())
}
func cmpStringNumeric(t iterator, op string, m, n interface{}) bool {

View File

@@ -227,6 +227,7 @@ func (c *childQuery) position() int {
type descendantQuery struct {
iterator func() NodeNavigator
posit int
level int
Self bool
Input query
@@ -242,7 +243,7 @@ func (d *descendantQuery) Select(t iterator) NodeNavigator {
return nil
}
node = node.Copy()
level := 0
d.level = 0
positmap := make(map[int]int)
first := true
d.iterator = func() NodeNavigator {
@@ -250,30 +251,30 @@ func (d *descendantQuery) Select(t iterator) NodeNavigator {
first = false
if d.Predicate(node) {
d.posit = 1
positmap[level] = 1
positmap[d.level] = 1
return node
}
}
for {
if node.MoveToChild() {
level++
positmap[level] = 0
d.level = d.level + 1
positmap[d.level] = 0
} else {
for {
if level == 0 {
if d.level == 0 {
return nil
}
if node.MoveToNext() {
break
}
node.MoveToParent()
level--
d.level = d.level - 1
}
}
if d.Predicate(node) {
positmap[level]++
d.posit = positmap[level]
positmap[d.level]++
d.posit = positmap[d.level]
return node
}
}
@@ -302,6 +303,10 @@ func (d *descendantQuery) position() int {
return d.posit
}
// depth returns d.level, the level counter maintained by Select: it is reset
// to 0 when a traversal starts, incremented on each move to a child and
// decremented on each move back to a parent.
func (d *descendantQuery) depth() int {
return d.level
}
// Clone returns a new descendantQuery with the same Self flag and Predicate
// and a clone of Input. The iteration state (iterator, posit, level) is not
// copied, so the clone starts from its zero values.
func (d *descendantQuery) Clone() query {
return &descendantQuery{Self: d.Self, Input: d.Input.Clone(), Predicate: d.Predicate}
}
@@ -538,6 +543,7 @@ type filterQuery struct {
Input query
Predicate query
posit int
positmap map[int]int
}
func (f *filterQuery) do(t iterator) bool {
@@ -563,7 +569,9 @@ func (f *filterQuery) position() int {
}
func (f *filterQuery) Select(t iterator) NodeNavigator {
if f.positmap == nil {
f.positmap = make(map[int]int)
}
for {
node := f.Input.Select(t)
@@ -574,10 +582,13 @@ func (f *filterQuery) Select(t iterator) NodeNavigator {
t.Current().MoveTo(node)
if f.do(t) {
f.posit++
// fix https://github.com/antchfx/htmlquery/issues/26
// Calculate and keep the each of matching node's position in the same depth.
level := getNodeDepth(f.Input)
f.positmap[level]++
f.posit = f.positmap[level]
return node
}
f.posit = 0
}
}
@@ -590,8 +601,9 @@ func (f *filterQuery) Clone() query {
return &filterQuery{Input: f.Input.Clone(), Predicate: f.Predicate.Clone()}
}
// functionQuery is an XPath function that call a function to returns
// value of current NodeNavigator node.
// functionQuery is an XPath function that returns a computed value for
// the Evaluate call of the current NodeNavigator node. Select call isn't
// applicable for functionQuery.
type functionQuery struct {
Input query // Node Set
Func func(query, iterator) interface{} // The xpath function.
@@ -611,6 +623,34 @@ func (f *functionQuery) Clone() query {
return &functionQuery{Input: f.Input.Clone(), Func: f.Func}
}
// transformFunctionQuery differs from functionQuery: the latter computes a scalar
// value (number, string, boolean) for the current NodeNavigator node, while
// transformFunctionQuery performs a mapping or transform of the current NodeNavigator
// and returns a new NodeNavigator. It is used for non-scalar XPath functions such as
// reverse(), remove(), subsequence(), unordered(), etc.
type transformFunctionQuery struct {
// Input is the query handed to Func as its first argument.
Input query
// Func builds the node iterator from Input and the current iterator.
Func func(query, iterator) func() NodeNavigator
// iterator caches the result of Func; it is nil until the first Select
// call and is cleared again by Evaluate.
iterator func() NodeNavigator
}
// Select returns the next node produced by the transform. On the first call
// (or the first call after Evaluate cleared the cache) it builds the node
// iterator by invoking Func with Input and t, stores it on f, and then pulls
// one node from it; later calls reuse the stored iterator.
func (f *transformFunctionQuery) Select(t iterator) NodeNavigator {
	next := f.iterator
	if next == nil {
		next = f.Func(f.Input, t)
		f.iterator = next
	}
	return next()
}
// Evaluate evaluates Input against t and discards the result, clears the
// cached node iterator so that the next Select call rebuilds it through Func,
// and returns f itself as the evaluation result.
func (f *transformFunctionQuery) Evaluate(t iterator) interface{} {
f.Input.Evaluate(t)
f.iterator = nil
return f
}
// Clone returns a new transformFunctionQuery that shares Func and holds a
// clone of Input. The cached iterator is not copied, so the clone builds its
// own on the first Select call.
func (f *transformFunctionQuery) Clone() query {
return &transformFunctionQuery{Input: f.Input.Clone(), Func: f.Func}
}
// constantQuery is an XPath constant operand.
type constantQuery struct {
Val interface{}
@@ -827,8 +867,18 @@ func getHashCode(n NodeNavigator) uint64 {
switch n.NodeType() {
case AttributeNode, TextNode, CommentNode:
sb.WriteString(fmt.Sprintf("%s=%s", n.LocalName(), n.Value()))
if n.MoveToParent() {
sb.WriteString(n.LocalName())
// https://github.com/antchfx/htmlquery/issues/25
d := 1
for n.MoveToPrevious() {
d++
}
sb.WriteString(fmt.Sprintf("-%d", d))
for n.MoveToParent() {
d = 1
for n.MoveToPrevious() {
d++
}
sb.WriteString(fmt.Sprintf("-%d", d))
}
case ElementNode:
sb.WriteString(n.Prefix() + n.LocalName())
@@ -860,3 +910,13 @@ func getNodePosition(q query) int {
}
return 1
}
// getNodeDepth returns the value of q's depth() method when q provides one
// (descendantQuery does), and 0 for any query that does not.
func getNodeDepth(q query) int {
	type depther interface {
		depth() int
	}

	d, ok := q.(depther)
	if !ok {
		return 0
	}
	return d.depth()
}

View File

@@ -2,6 +2,7 @@ package xpath
import (
"errors"
"fmt"
)
// NodeType represents a type of XPath node.
@@ -144,6 +145,9 @@ func Compile(expr string) (*Expr, error) {
if err != nil {
return nil, err
}
if qy == nil {
return nil, fmt.Errorf(fmt.Sprintf("undeclared variable in XPath expression: %s", expr))
}
return &Expr{s: expr, q: qy}, nil
}