Skip cleaning for search by name scrape queries (#2059)

* Skip pp for search by name queries
* upgrade htmlquery
This commit is contained in:
bnkai
2021-12-16 02:18:39 +02:00
committed by GitHub
parent 439c338049
commit 66dd239732
34 changed files with 10925 additions and 10665 deletions

View File

@@ -12,6 +12,16 @@ Overview
`htmlquery` built-in the query object caching feature based on [LRU](https://godoc.org/github.com/golang/groupcache/lru), this feature will caching the recently used XPATH query string. Enable query caching can avoid re-compile XPath expression each query.
You can visit this page to learn about the supported XPath(1.0/2.0) syntax. https://github.com/antchfx/xpath
XPath query packages for Go
===
| Name | Description |
| ------------------------------------------------- | ----------------------------------------- |
| [htmlquery](https://github.com/antchfx/htmlquery) | XPath query package for the HTML document |
| [xmlquery](https://github.com/antchfx/xmlquery) | XPath query package for the XML document |
| [jsonquery](https://github.com/antchfx/jsonquery) | XPath query package for the JSON document |
Installation
====
@@ -60,15 +70,15 @@ list := htmlquery.Find(doc, "//a")
#### Find all A elements that have `href` attribute.
```go
list := range htmlquery.Find(doc, "//a[@href]")
list := htmlquery.Find(doc, "//a[@href]")
```
#### Find all A elements with `href` attribute and only return `href` value.
```go
list := range htmlquery.Find(doc, "//a/@href")
for n := range list{
fmt.Println(htmlquery.InnerText(n)) // output @href value without A element.
list := htmlquery.Find(doc, "//a/@href")
for _ , n := range list{
fmt.Println(htmlquery.SelectAttr(n, "href")) // output @href value
}
```
@@ -78,6 +88,13 @@ for n := range list{
a := htmlquery.FindOne(doc, "//a[3]")
```
### Find children element (img) under A `href` and print the source
```go
a := htmlquery.FindOne(doc, "//a")
img := htmlquery.FindOne(a, "//img")
fmt.Prinln(htmlquery.SelectAttr(img, "src")) // output @src value
```
#### Evaluate the number of all IMG element.
```go
@@ -87,6 +104,30 @@ fmt.Printf("total count is %f", v)
```
Quick Starts
===
```go
func main() {
doc, err := htmlquery.LoadURL("https://www.bing.com/search?q=golang")
if err != nil {
panic(err)
}
// Find all news item.
list, err := htmlquery.QueryAll(doc, "//ol/li")
if err != nil {
panic(err)
}
for i, n := range list {
a := htmlquery.FindOne(n, "//a")
if a != nil {
fmt.Printf("%d %s(%s)\n", i, htmlquery.InnerText(a), htmlquery.SelectAttr(a, "href"))
}
}
}
```
FAQ
====
@@ -117,52 +158,6 @@ BenchmarkDisableSelectorCache-4 500000 3162 ns/op
htmlquery.DisableSelectorCache = true
```
Changelogs
===
2019-11-19
- Add built-in query object cache feature, avoid re-compilation for the same query string. [#16](https://github.com/antchfx/htmlquery/issues/16)
- Added LoadDoc [18](https://github.com/antchfx/htmlquery/pull/18)
2019-10-05
- Add new methods that compatible with invalid XPath expression error: `QueryAll` and `Query`.
- Add `QuerySelector` and `QuerySelectorAll` methods, supported reused your query object.
2019-02-04
- [#7](https://github.com/antchfx/htmlquery/issues/7) Removed deprecated `FindEach()` and `FindEachWithBreak()` methods.
2018-12-28
- Avoid adding duplicate elements to list for `Find()` method. [#6](https://github.com/antchfx/htmlquery/issues/6)
Tutorial
===
```go
func main() {
doc, err := htmlquery.LoadURL("https://www.bing.com/search?q=golang")
if err != nil {
panic(err)
}
// Find all news item.
list, err := htmlquery.QueryAll(doc, "//ol/li")
if err != nil {
panic(err)
}
for i, n := range list {
a := htmlquery.FindOne(n, "//a")
fmt.Printf("%d %s(%s)\n", i, htmlquery.InnerText(a), htmlquery.SelectAttr(a, "href"))
}
}
```
List of supported XPath query packages
===
| Name | Description |
| ------------------------------------------------- | ----------------------------------------- |
| [htmlquery](https://github.com/antchfx/htmlquery) | XPath query package for the HTML document |
| [xmlquery](https://github.com/antchfx/xmlquery) | XPath query package for the XML document |
| [jsonquery](https://github.com/antchfx/jsonquery) | XPath query package for the JSON document |
Questions
===
Please let me know if you have any questions.

View File

@@ -55,10 +55,10 @@ func QueryAll(top *html.Node, expr string) ([]*html.Node, error) {
return nodes, nil
}
// Query searches the html.Node that matches by the specified XPath expr,
// and return the first element of matched html.Node.
// Query runs the given XPath expression against the given html.Node and
// returns the first matching html.Node, or nil if no matches are found.
//
// Return an error if the expression `expr` cannot be parsed.
// Returns an error if the expression `expr` cannot be parsed.
func Query(top *html.Node, expr string) (*html.Node, error) {
exp, err := getQuery(expr)
if err != nil {
@@ -83,11 +83,6 @@ func QuerySelectorAll(top *html.Node, selector *xpath.Expr) []*html.Node {
for t.MoveNext() {
nav := t.Current().(*NodeNavigator)
n := getCurrentNode(nav)
// avoid adding duplicate nodes.
if len(elems) > 0 && (elems[0] == n || (nav.NodeType() == xpath.AttributeNode &&
nav.LocalName() == elems[0].Data && nav.Value() == InnerText(elems[0]))) {
continue
}
elems = append(elems, n)
}
return elems
@@ -179,6 +174,19 @@ func SelectAttr(n *html.Node, name string) (val string) {
return
}
// ExistsAttr returns whether attribute with specified name exists.
func ExistsAttr(n *html.Node, name string) bool {
if n == nil {
return false
}
for _, attr := range n.Attr {
if attr.Key == name {
return true
}
}
return false
}
// OutputHTML returns the text including tags name.
func OutputHTML(n *html.Node, self bool) string {
var buf bytes.Buffer

View File

@@ -138,6 +138,7 @@ Supported Features
`lang()`| ✗ |
`last()`| ✓ |
`local-name()`| ✓ |
`matches()`| ✓ |
`name()`| ✓ |
`namespace-uri()`| ✓ |
`normalize-space()`| ✓ |

View File

@@ -193,8 +193,23 @@ func (b *builder) processFunctionNode(root *functionNode) (query, error) {
if err != nil {
return nil, err
}
qyOutput = &functionQuery{Input: b.firstInput, Func: containsFunc(arg1, arg2)}
case "matches":
//matches(string , pattern)
if len(root.Args) != 2 {
return nil, errors.New("xpath: matches function must have two parameters")
}
var (
arg1, arg2 query
err error
)
if arg1, err = b.processNode(root.Args[0]); err != nil {
return nil, err
}
if arg2, err = b.processNode(root.Args[1]); err != nil {
return nil, err
}
qyOutput = &functionQuery{Input: b.firstInput, Func: matchesFunc(arg1, arg2)}
case "substring":
//substring( string , start [, length] )
if len(root.Args) < 2 {
@@ -435,13 +450,15 @@ func (b *builder) processOperatorNode(root *operatorNode) (query, error) {
}
var qyOutput query
switch root.Op {
case "+", "-", "div", "mod": // Numeric operator
case "+", "-", "*", "div", "mod": // Numeric operator
var exprFunc func(interface{}, interface{}) interface{}
switch root.Op {
case "+":
exprFunc = plusFunc
case "-":
exprFunc = minusFunc
case "*":
exprFunc = mulFunc
case "div":
exprFunc = divFunc
case "mod":
@@ -498,6 +515,12 @@ func (b *builder) processNode(root node) (q query, err error) {
q, err = b.processFunctionNode(root.(*functionNode))
case nodeOperator:
q, err = b.processOperatorNode(root.(*operatorNode))
case nodeGroup:
q, err = b.processNode(root.(*groupNode).Input)
if err != nil {
return
}
q = &groupQuery{Input: q}
}
return
}

80
vendor/github.com/antchfx/xpath/cache.go generated vendored Normal file
View File

@@ -0,0 +1,80 @@
package xpath
import (
"regexp"
"sync"
)
type loadFunc func(key interface{}) (interface{}, error)
const (
defaultCap = 65536
)
// The reason we're building a simple capacity-resetting loading cache (when capacity reached) instead of using
// something like github.com/hashicorp/golang-lru is primarily due to (not wanting to create) external dependency.
// Currently this library has 0 external dep (other than go sdk), and supports go 1.6, 1.9, and 1.10 (and later).
// Creating external lib dependencies (plus their transitive dependencies) would make things hard if not impossible.
// We expect under most circumstances, the defaultCap is big enough for any long running services that use this
// library if their xpath regexp cardinality is low. However, in extreme cases when the capacity is reached, we
// simply reset the cache, taking a small subsequent perf hit (next to nothing considering amortization) in trade
// of more complex and less performant LRU type of construct.
type loadingCache struct {
sync.RWMutex
cap int
load loadFunc
m map[interface{}]interface{}
reset int
}
// NewLoadingCache creates a new instance of a loading cache with capacity. Capacity must be >= 0, or
// it will panic. Capacity == 0 means the cache growth is unbounded.
func NewLoadingCache(load loadFunc, capacity int) *loadingCache {
if capacity < 0 {
panic("capacity must be >= 0")
}
return &loadingCache{cap: capacity, load: load, m: make(map[interface{}]interface{})}
}
func (c *loadingCache) get(key interface{}) (interface{}, error) {
c.RLock()
v, found := c.m[key]
c.RUnlock()
if found {
return v, nil
}
v, err := c.load(key)
if err != nil {
return nil, err
}
c.Lock()
if c.cap > 0 && len(c.m) >= c.cap {
c.m = map[interface{}]interface{}{key: v}
c.reset++
} else {
c.m[key] = v
}
c.Unlock()
return v, nil
}
var (
// RegexpCache is a loading cache for string -> *regexp.Regexp mapping. It is exported so that in rare cases
// client can customize load func and/or capacity.
RegexpCache = defaultRegexpCache()
)
func defaultRegexpCache() *loadingCache {
return NewLoadingCache(
func(key interface{}) (interface{}, error) {
return regexp.Compile(key.(string))
}, defaultCap)
}
func getRegexp(pattern string) (*regexp.Regexp, error) {
exp, err := RegexpCache.get(pattern)
if err != nil {
return nil, err
}
return exp.(*regexp.Regexp), nil
}

View File

@@ -4,11 +4,26 @@ import (
"errors"
"fmt"
"math"
"regexp"
"strconv"
"strings"
"sync"
"unicode"
)
// Defined an interface of stringBuilder that compatible with
// strings.Builder(go 1.10) and bytes.Buffer(< go 1.10)
type stringBuilder interface {
WriteRune(r rune) (n int, err error)
WriteString(s string) (int, error)
Reset()
Grow(n int)
String() string
}
var builderPool = sync.Pool{New: func() interface{} {
return newStringBuilder()
}}
// The XPath function list.
func predicate(q query) func(NodeNavigator) bool {
@@ -25,7 +40,7 @@ func predicate(q query) func(NodeNavigator) bool {
func positionFunc(q query, t iterator) interface{} {
var (
count = 1
node = t.Current()
node = t.Current().Copy()
)
test := predicate(q)
for node.MoveToPrevious() {
@@ -40,7 +55,7 @@ func positionFunc(q query, t iterator) interface{} {
func lastFunc(q query, t iterator) interface{} {
var (
count = 0
node = t.Current()
node = t.Current().Copy()
)
node.MoveToFirst()
test := predicate(q)
@@ -58,6 +73,7 @@ func lastFunc(q query, t iterator) interface{} {
// countFunc is a XPath Node Set functions count(node-set).
func countFunc(q query, t iterator) interface{} {
var count = 0
q = functionArgs(q)
test := predicate(q)
switch typ := q.Evaluate(t).(type) {
case query:
@@ -73,7 +89,7 @@ func countFunc(q query, t iterator) interface{} {
// sumFunc is a XPath Node Set functions sum(node-set).
func sumFunc(q query, t iterator) interface{} {
var sum float64
switch typ := q.Evaluate(t).(type) {
switch typ := functionArgs(q).Evaluate(t).(type) {
case query:
for node := typ.Select(t); node != nil; node = typ.Select(t) {
if v, err := strconv.ParseFloat(node.Value(), 64); err == nil {
@@ -116,19 +132,19 @@ func asNumber(t iterator, o interface{}) float64 {
// ceilingFunc is a XPath Node Set functions ceiling(node-set).
func ceilingFunc(q query, t iterator) interface{} {
val := asNumber(t, q.Evaluate(t))
val := asNumber(t, functionArgs(q).Evaluate(t))
return math.Ceil(val)
}
// floorFunc is a XPath Node Set functions floor(node-set).
func floorFunc(q query, t iterator) interface{} {
val := asNumber(t, q.Evaluate(t))
val := asNumber(t, functionArgs(q).Evaluate(t))
return math.Floor(val)
}
// roundFunc is a XPath Node Set functions round(node-set).
func roundFunc(q query, t iterator) interface{} {
val := asNumber(t, q.Evaluate(t))
val := asNumber(t, functionArgs(q).Evaluate(t))
//return math.Round(val)
return round(val)
}
@@ -140,7 +156,7 @@ func nameFunc(arg query) func(query, iterator) interface{} {
if arg == nil {
v = t.Current()
} else {
v = arg.Select(t)
v = arg.Clone().Select(t)
if v == nil {
return ""
}
@@ -160,7 +176,7 @@ func localNameFunc(arg query) func(query, iterator) interface{} {
if arg == nil {
v = t.Current()
} else {
v = arg.Select(t)
v = arg.Clone().Select(t)
if v == nil {
return ""
}
@@ -177,7 +193,7 @@ func namespaceFunc(arg query) func(query, iterator) interface{} {
v = t.Current()
} else {
// Get the first node in the node-set if specified.
v = arg.Select(t)
v = arg.Clone().Select(t)
if v == nil {
return ""
}
@@ -201,7 +217,7 @@ func asBool(t iterator, v interface{}) bool {
case *NodeIterator:
return v.MoveNext()
case bool:
return bool(v)
return v
case float64:
return v != 0
case string:
@@ -239,19 +255,19 @@ func asString(t iterator, v interface{}) string {
// booleanFunc is a XPath functions boolean([node-set]).
func booleanFunc(q query, t iterator) interface{} {
v := q.Evaluate(t)
v := functionArgs(q).Evaluate(t)
return asBool(t, v)
}
// numberFunc is a XPath functions number([node-set]).
func numberFunc(q query, t iterator) interface{} {
v := q.Evaluate(t)
v := functionArgs(q).Evaluate(t)
return asNumber(t, v)
}
// stringFunc is a XPath functions string([node-set]).
func stringFunc(q query, t iterator) interface{} {
v := q.Evaluate(t)
v := functionArgs(q).Evaluate(t)
return asString(t, v)
}
@@ -338,15 +354,39 @@ func containsFunc(arg1, arg2 query) func(query, iterator) interface{} {
}
}
var (
regnewline = regexp.MustCompile(`[\r\n\t]`)
regseqspace = regexp.MustCompile(`\s{2,}`)
)
// matchesFunc is an XPath function that tests a given string against a regexp pattern.
// Note: does not support https://www.w3.org/TR/xpath-functions-31/#func-matches 3rd optional `flags` argument; if
// needed, directly put flags in the regexp pattern, such as `(?i)^pattern$` for `i` flag.
func matchesFunc(arg1, arg2 query) func(query, iterator) interface{} {
return func(q query, t iterator) interface{} {
var s string
switch typ := functionArgs(arg1).Evaluate(t).(type) {
case string:
s = typ
case query:
node := typ.Select(t)
if node == nil {
return ""
}
s = node.Value()
}
var pattern string
var ok bool
if pattern, ok = functionArgs(arg2).Evaluate(t).(string); !ok {
panic(errors.New("matches() function second argument type must be string"))
}
re, err := getRegexp(pattern)
if err != nil {
panic(fmt.Errorf("matches() function second argument is not a valid regexp pattern, err: %s", err.Error()))
}
return re.MatchString(s)
}
}
// normalizespaceFunc is XPath functions normalize-space(string?)
func normalizespaceFunc(q query, t iterator) interface{} {
var m string
switch typ := q.Evaluate(t).(type) {
switch typ := functionArgs(q).Evaluate(t).(type) {
case string:
m = typ
case query:
@@ -356,10 +396,26 @@ func normalizespaceFunc(q query, t iterator) interface{} {
}
m = node.Value()
}
m = strings.TrimSpace(m)
m = regnewline.ReplaceAllString(m, " ")
m = regseqspace.ReplaceAllString(m, " ")
return m
var b = builderPool.Get().(stringBuilder)
b.Grow(len(m))
runeStr := []rune(strings.TrimSpace(m))
l := len(runeStr)
for i := range runeStr {
r := runeStr[i]
isSpace := unicode.IsSpace(r)
if !(isSpace && (i+1 < l && unicode.IsSpace(runeStr[i+1]))) {
if isSpace {
r = ' '
}
b.WriteRune(r)
}
}
result := b.String()
b.Reset()
builderPool.Put(b)
return result
}
// substringFunc is XPath functions substring function returns a part of a given string.
@@ -466,7 +522,7 @@ func translateFunc(arg1, arg2, arg3 query) func(query, iterator) interface{} {
src := asString(t, functionArgs(arg2).Evaluate(t))
dst := asString(t, functionArgs(arg3).Evaluate(t))
var replace []string
replace := make([]string, 0, len(src))
for i, s := range src {
d := ""
if i < len(dst) {
@@ -491,7 +547,7 @@ func replaceFunc(arg1, arg2, arg3 query) func(query, iterator) interface{} {
// notFunc is XPATH functions not(expression) function operation.
func notFunc(q query, t iterator) interface{} {
switch v := q.Evaluate(t).(type) {
switch v := functionArgs(q).Evaluate(t).(type) {
case bool:
return !v
case query:
@@ -507,20 +563,25 @@ func notFunc(q query, t iterator) interface{} {
// concat( string1 , string2 [, stringn]* )
func concatFunc(args ...query) func(query, iterator) interface{} {
return func(q query, t iterator) interface{} {
var a []string
b := builderPool.Get().(stringBuilder)
for _, v := range args {
v = functionArgs(v)
switch v := v.Evaluate(t).(type) {
case string:
a = append(a, v)
b.WriteString(v)
case query:
node := v.Select(t)
if node != nil {
a = append(a, node.Value())
b.WriteString(node.Value())
}
}
}
return strings.Join(a, "")
result := b.String()
b.Reset()
builderPool.Put(b)
return result
}
}

View File

@@ -2,8 +2,15 @@
package xpath
import "math"
import (
"math"
"strings"
)
func round(f float64) int {
return int(math.Round(f))
}
func newStringBuilder() stringBuilder{
return &strings.Builder{}
}

View File

@@ -2,7 +2,10 @@
package xpath
import "math"
import (
"bytes"
"math"
)
// math.Round() is supported by Go 1.10+,
// This method just compatible for version <1.10.
@@ -13,3 +16,7 @@ func round(f float64) int {
}
return int(f + math.Copysign(0.5, f))
}
func newStringBuilder() stringBuilder {
return &bytes.Buffer{}
}

View File

@@ -173,7 +173,7 @@ func cmpNodeSetNodeSet(t iterator, op string, m, n interface{}) bool {
if y == nil {
return false
}
return cmpStringStringF(op,x.Value(),y.Value())
return cmpStringStringF(op, x.Value(), y.Value())
}
func cmpStringNumeric(t iterator, op string, m, n interface{}) bool {

View File

@@ -65,6 +65,7 @@ const (
nodeOperator
nodeVariable
nodeConstantOperand
nodeGroup
)
type parser struct {
@@ -104,6 +105,10 @@ func newFilterNode(n, m node) node {
return &filterNode{nodeType: nodeFilter, Input: n, Condition: m}
}
func newGroupNode(n node) node {
return &groupNode{nodeType: nodeGroup, Input: n}
}
// newRootNode returns a root node.
func newRootNode(s string) node {
return &rootNode{nodeType: nodeRoot, slash: s}
@@ -492,6 +497,9 @@ func (p *parser) parsePrimaryExpr(n node) (opnd node) {
case itemLParens:
p.next()
opnd = p.parseExpression(n)
if opnd.Type() != nodeConstantOperand {
opnd = newGroupNode(opnd)
}
p.skipItem(itemRParens)
case itemName:
if p.r.canBeFunc && !isNodeType(p.r) {
@@ -587,6 +595,16 @@ func (o *operandNode) String() string {
return fmt.Sprintf("%v", o.Val)
}
// groupNode holds a set of node expression
type groupNode struct {
nodeType
Input node
}
func (g *groupNode) String() string {
return fmt.Sprintf("%s", g.Input)
}
// filterNode holds a condition filter.
type filterNode struct {
nodeType

View File

@@ -76,6 +76,7 @@ func (a *ancestorQuery) Select(t iterator) NodeNavigator {
return nil
}
first := true
node = node.Copy()
a.iterator = func() NodeNavigator {
if first && a.Self {
first = false
@@ -668,6 +669,35 @@ func (c *constantQuery) Clone() query {
return c
}
type groupQuery struct {
posit int
Input query
}
func (g *groupQuery) Select(t iterator) NodeNavigator {
for {
node := g.Input.Select(t)
if node == nil {
return nil
}
g.posit++
return node.Copy()
}
}
func (g *groupQuery) Evaluate(t iterator) interface{} {
return g.Input.Evaluate(t)
}
func (g *groupQuery) Clone() query {
return &groupQuery{Input: g.Input}
}
func (g *groupQuery) position() int {
return g.posit
}
// logicalQuery is an XPath logical expression.
type logicalQuery struct {
Left, Right query