Skip cleaning for search by name scrape queries (#2059)

* Skip pp for search by name queries * upgrade htmlquery
2025-12-18 04:44:37 +03:00 · 2021-12-16 02:18:39 +02:00
parent 439c338049
commit 66dd239732
34 changed files with 10925 additions and 10665 deletions
--- a/vendor/github.com/antchfx/htmlquery/README.md
+++ b/vendor/github.com/antchfx/htmlquery/README.md
@@ -12,6 +12,16 @@ Overview

 `htmlquery` has a built-in query object cache based on [LRU](https://godoc.org/github.com/golang/groupcache/lru) that caches recently used XPath query strings. Enabling query caching avoids re-compiling the XPath expression on every query.

+You can visit this page to learn about the supported XPath(1.0/2.0) syntax. https://github.com/antchfx/xpath
+
+XPath query packages for Go
+===
+| Name                                              | Description                               |
+| ------------------------------------------------- | ----------------------------------------- |
+| [htmlquery](https://github.com/antchfx/htmlquery) | XPath query package for the HTML document |
+| [xmlquery](https://github.com/antchfx/xmlquery)   | XPath query package for the XML document  |
+| [jsonquery](https://github.com/antchfx/jsonquery) | XPath query package for the JSON document |
+
 Installation
 ====

@@ -60,15 +70,15 @@ list := htmlquery.Find(doc, "//a")
 #### Find all A elements that have `href` attribute.

 ```go
-list := range htmlquery.Find(doc, "//a[@href]")	
+list := htmlquery.Find(doc, "//a[@href]")	
 ```

 #### Find all A elements with `href` attribute and only return `href` value.

 ```go
-list := range htmlquery.Find(doc, "//a/@href")	
-for n := range list{
-	fmt.Println(htmlquery.InnerText(n)) // output @href value without A element.
+list := htmlquery.Find(doc, "//a/@href")	
+for _ , n := range list{
+	fmt.Println(htmlquery.SelectAttr(n, "href")) // output @href value
 }
 ```

@@ -78,6 +88,13 @@ for n := range list{
 a := htmlquery.FindOne(doc, "//a[3]")
 ```

+### Find a child element (img) under an A element and print its `src` attribute
+```go
+a := htmlquery.FindOne(doc, "//a")
+img := htmlquery.FindOne(a, "//img")
+fmt.Println(htmlquery.SelectAttr(img, "src")) // output @src value
+```
+
 #### Evaluate the number of all IMG element.

 ```go
@@ -87,6 +104,30 @@ fmt.Printf("total count is %f", v)
 ```


+Quick Starts
+===
+
+```go
+func main() {
+	doc, err := htmlquery.LoadURL("https://www.bing.com/search?q=golang")
+	if err != nil {
+		panic(err)
+	}
+	// Find all news items.
+	list, err := htmlquery.QueryAll(doc, "//ol/li")
+	if err != nil {
+		panic(err)
+	}
+	for i, n := range list {
+		a := htmlquery.FindOne(n, "//a")
+		if a != nil {
+		    fmt.Printf("%d %s(%s)\n", i, htmlquery.InnerText(a), htmlquery.SelectAttr(a, "href"))
+		}
+	}
+}
+```
+
+
 FAQ
 ====

@@ -117,52 +158,6 @@ BenchmarkDisableSelectorCache-4           500000              3162 ns/op
 htmlquery.DisableSelectorCache = true
 ```

-Changelogs
-===
-
-2019-11-19 
- Add built-in query object cache feature, avoid re-compilation for the same query string. [#16](https://github.com/antchfx/htmlquery/issues/16)
- Added LoadDoc [18](https://github.com/antchfx/htmlquery/pull/18)
-
-2019-10-05 
- Add new methods that compatible with invalid XPath expression error: `QueryAll` and `Query`.
- Add `QuerySelector` and `QuerySelectorAll` methods, supported reused your query object.
-
-2019-02-04
- [#7](https://github.com/antchfx/htmlquery/issues/7) Removed deprecated `FindEach()` and `FindEachWithBreak()` methods.
-
-2018-12-28
- Avoid adding duplicate elements to list for `Find()` method. [#6](https://github.com/antchfx/htmlquery/issues/6)
-
-Tutorial
-===
-
-```go
-func main() {
-	doc, err := htmlquery.LoadURL("https://www.bing.com/search?q=golang")
-	if err != nil {
-		panic(err)
-	}
-	// Find all news item.
-	list, err := htmlquery.QueryAll(doc, "//ol/li")
-	if err != nil {
-		panic(err)
-	}
-	for i, n := range list {
-		a := htmlquery.FindOne(n, "//a")
-		fmt.Printf("%d %s(%s)\n", i, htmlquery.InnerText(a), htmlquery.SelectAttr(a, "href"))
-	}
-}
-```
-
-List of supported XPath query packages
-===
-| Name                                              | Description                               |
-| ------------------------------------------------- | ----------------------------------------- |
-| [htmlquery](https://github.com/antchfx/htmlquery) | XPath query package for the HTML document |
-| [xmlquery](https://github.com/antchfx/xmlquery)   | XPath query package for the XML document  |
-| [jsonquery](https://github.com/antchfx/jsonquery) | XPath query package for the JSON document |
-
 Questions
 ===
 Please let me know if you have any questions.
--- a/vendor/github.com/antchfx/htmlquery/query.go
+++ b/vendor/github.com/antchfx/htmlquery/query.go
@@ -55,10 +55,10 @@ func QueryAll(top *html.Node, expr string) ([]*html.Node, error) {
 	return nodes, nil
 }

-// Query searches the html.Node that matches by the specified XPath expr,
-// and return the first element of matched html.Node.
+// Query runs the given XPath expression against the given html.Node and
+// returns the first matching html.Node, or nil if no matches are found.
 //
-// Return an error if the expression `expr` cannot be parsed.
+// Returns an error if the expression `expr` cannot be parsed.
 func Query(top *html.Node, expr string) (*html.Node, error) {
 	exp, err := getQuery(expr)
 	if err != nil {
@@ -83,11 +83,6 @@ func QuerySelectorAll(top *html.Node, selector *xpath.Expr) []*html.Node {
 	for t.MoveNext() {
 		nav := t.Current().(*NodeNavigator)
 		n := getCurrentNode(nav)
-		// avoid adding duplicate nodes.
-		if len(elems) > 0 && (elems[0] == n || (nav.NodeType() == xpath.AttributeNode &&
-			nav.LocalName() == elems[0].Data && nav.Value() == InnerText(elems[0]))) {
-			continue
-		}
 		elems = append(elems, n)
 	}
 	return elems
@@ -179,6 +174,19 @@ func SelectAttr(n *html.Node, name string) (val string) {
 	return
 }

+// ExistsAttr reports whether n has an attribute whose Key equals name exactly; it returns false when n is nil.
+func ExistsAttr(n *html.Node, name string) bool {
+	if n == nil {
+		return false
+	}
+	for _, attr := range n.Attr {
+		if attr.Key == name {
+			return true
+		}
+	}
+	return false
+}
+
 // OutputHTML returns the text including tags name.
 func OutputHTML(n *html.Node, self bool) string {
 	var buf bytes.Buffer