Skip cleaning for search by name scrape queries (#2059)

* Skip pp for search by name queries
* upgrade htmlquery
This commit is contained in:
bnkai
2021-12-16 02:18:39 +02:00
committed by GitHub
parent 439c338049
commit 66dd239732
34 changed files with 10925 additions and 10665 deletions

View File

@@ -12,6 +12,16 @@ Overview
`htmlquery` built-in the query object caching feature based on [LRU](https://godoc.org/github.com/golang/groupcache/lru), this feature will caching the recently used XPATH query string. Enable query caching can avoid re-compile XPath expression each query.
You can visit this page to learn about the supported XPath(1.0/2.0) syntax. https://github.com/antchfx/xpath
XPath query packages for Go
===
| Name | Description |
| ------------------------------------------------- | ----------------------------------------- |
| [htmlquery](https://github.com/antchfx/htmlquery) | XPath query package for the HTML document |
| [xmlquery](https://github.com/antchfx/xmlquery) | XPath query package for the XML document |
| [jsonquery](https://github.com/antchfx/jsonquery) | XPath query package for the JSON document |
Installation
====
@@ -60,15 +70,15 @@ list := htmlquery.Find(doc, "//a")
#### Find all A elements that have `href` attribute.
```go
list := range htmlquery.Find(doc, "//a[@href]")
list := htmlquery.Find(doc, "//a[@href]")
```
#### Find all A elements with `href` attribute and only return `href` value.
```go
list := range htmlquery.Find(doc, "//a/@href")
for n := range list{
fmt.Println(htmlquery.InnerText(n)) // output @href value without A element.
list := htmlquery.Find(doc, "//a/@href")
for _ , n := range list{
fmt.Println(htmlquery.SelectAttr(n, "href")) // output @href value
}
```
@@ -78,6 +88,13 @@ for n := range list{
a := htmlquery.FindOne(doc, "//a[3]")
```
### Find children element (img) under A `href` and print the source
```go
a := htmlquery.FindOne(doc, "//a")
img := htmlquery.FindOne(a, "//img")
fmt.Prinln(htmlquery.SelectAttr(img, "src")) // output @src value
```
#### Evaluate the number of all IMG element.
```go
@@ -87,6 +104,30 @@ fmt.Printf("total count is %f", v)
```
Quick Starts
===
```go
func main() {
doc, err := htmlquery.LoadURL("https://www.bing.com/search?q=golang")
if err != nil {
panic(err)
}
// Find all news item.
list, err := htmlquery.QueryAll(doc, "//ol/li")
if err != nil {
panic(err)
}
for i, n := range list {
a := htmlquery.FindOne(n, "//a")
if a != nil {
fmt.Printf("%d %s(%s)\n", i, htmlquery.InnerText(a), htmlquery.SelectAttr(a, "href"))
}
}
}
```
FAQ
====
@@ -117,52 +158,6 @@ BenchmarkDisableSelectorCache-4 500000 3162 ns/op
htmlquery.DisableSelectorCache = true
```
Changelogs
===
2019-11-19
- Add built-in query object cache feature, avoid re-compilation for the same query string. [#16](https://github.com/antchfx/htmlquery/issues/16)
- Added LoadDoc [18](https://github.com/antchfx/htmlquery/pull/18)
2019-10-05
- Add new methods that compatible with invalid XPath expression error: `QueryAll` and `Query`.
- Add `QuerySelector` and `QuerySelectorAll` methods, supported reused your query object.
2019-02-04
- [#7](https://github.com/antchfx/htmlquery/issues/7) Removed deprecated `FindEach()` and `FindEachWithBreak()` methods.
2018-12-28
- Avoid adding duplicate elements to list for `Find()` method. [#6](https://github.com/antchfx/htmlquery/issues/6)
Tutorial
===
```go
func main() {
doc, err := htmlquery.LoadURL("https://www.bing.com/search?q=golang")
if err != nil {
panic(err)
}
// Find all news item.
list, err := htmlquery.QueryAll(doc, "//ol/li")
if err != nil {
panic(err)
}
for i, n := range list {
a := htmlquery.FindOne(n, "//a")
fmt.Printf("%d %s(%s)\n", i, htmlquery.InnerText(a), htmlquery.SelectAttr(a, "href"))
}
}
```
List of supported XPath query packages
===
| Name | Description |
| ------------------------------------------------- | ----------------------------------------- |
| [htmlquery](https://github.com/antchfx/htmlquery) | XPath query package for the HTML document |
| [xmlquery](https://github.com/antchfx/xmlquery) | XPath query package for the XML document |
| [jsonquery](https://github.com/antchfx/jsonquery) | XPath query package for the JSON document |
Questions
===
Please let me know if you have any questions.

View File

@@ -55,10 +55,10 @@ func QueryAll(top *html.Node, expr string) ([]*html.Node, error) {
return nodes, nil
}
// Query searches the html.Node that matches by the specified XPath expr,
// and return the first element of matched html.Node.
// Query runs the given XPath expression against the given html.Node and
// returns the first matching html.Node, or nil if no matches are found.
//
// Return an error if the expression `expr` cannot be parsed.
// Returns an error if the expression `expr` cannot be parsed.
func Query(top *html.Node, expr string) (*html.Node, error) {
exp, err := getQuery(expr)
if err != nil {
@@ -83,11 +83,6 @@ func QuerySelectorAll(top *html.Node, selector *xpath.Expr) []*html.Node {
for t.MoveNext() {
nav := t.Current().(*NodeNavigator)
n := getCurrentNode(nav)
// avoid adding duplicate nodes.
if len(elems) > 0 && (elems[0] == n || (nav.NodeType() == xpath.AttributeNode &&
nav.LocalName() == elems[0].Data && nav.Value() == InnerText(elems[0]))) {
continue
}
elems = append(elems, n)
}
return elems
@@ -179,6 +174,19 @@ func SelectAttr(n *html.Node, name string) (val string) {
return
}
// ExistsAttr returns whether attribute with specified name exists.
func ExistsAttr(n *html.Node, name string) bool {
if n == nil {
return false
}
for _, attr := range n.Attr {
if attr.Key == name {
return true
}
}
return false
}
// OutputHTML returns the text including tags name.
func OutputHTML(n *html.Node, self bool) string {
var buf bytes.Buffer