Configurable scraper user agent string (#409)

* Add debug scrape option.

Co-authored-by: HiddenPants255 <>
This commit is contained in:
WithoutPants
2020-03-21 08:55:15 +11:00
committed by GitHub
parent ff495361d9
commit abf2b49803
10 changed files with 122 additions and 11 deletions

View File

@@ -1,7 +1,9 @@
package scraper
import (
"bytes"
"errors"
"net/http"
"net/url"
"reflect"
"regexp"
@@ -10,11 +12,17 @@ import (
"github.com/antchfx/htmlquery"
"golang.org/x/net/html"
"golang.org/x/net/html/charset"
"github.com/stashapp/stash/pkg/logger"
"github.com/stashapp/stash/pkg/manager/config"
"github.com/stashapp/stash/pkg/models"
)
// Timeout for the scrape http request. Includes transfer time. May want to make this
// configurable at some point.
const scrapeGetTimeout = time.Second * 30
type commonXPathConfig map[string]string
func (c commonXPathConfig) applyCommon(src string) string {
@@ -197,7 +205,7 @@ func (c xpathScraperAttrConfig) applySubScraper(value string) string {
return value
}
doc, err := htmlquery.LoadURL(value)
doc, err := loadURL(value, nil)
if err != nil {
logger.Warnf("Error getting URL '%s' for sub-scraper: %s", value, err.Error())
@@ -504,6 +512,42 @@ func (r xPathResults) setKey(index int, key string, value string) xPathResults {
return r
}
// loadURL performs a GET request for the given URL and parses the response
// body as HTML, honoring the configured scraper user agent (if any).
// When c carries debug options with PrintHTML enabled, the parsed document is
// re-rendered and logged for troubleshooting. It returns an error on network
// failure, non-2xx HTTP status, charset detection failure, or HTML parse
// failure.
func loadURL(url string, c *scraperConfig) (*html.Node, error) {
	client := &http.Client{
		Timeout: scrapeGetTimeout,
	}
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}

	userAgent := config.GetScraperUserAgent()
	if userAgent != "" {
		req.Header.Set("User-Agent", userAgent)
	}

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// Surface HTTP errors explicitly instead of silently parsing an error
	// page as if it were valid scrape content.
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return nil, errors.New("http error " + resp.Status + " getting " + url)
	}

	// charset.NewReader converts the body to UTF-8 based on the declared
	// Content-Type (falling back to sniffing), so non-UTF-8 pages parse
	// correctly.
	r, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type"))
	if err != nil {
		return nil, err
	}

	ret, err := html.Parse(r)

	if err == nil && c != nil && c.DebugOptions != nil && c.DebugOptions.PrintHTML {
		var b bytes.Buffer
		// Don't drop the render error: a failed debug render should be
		// visible rather than logging an empty body.
		if renderErr := html.Render(&b, ret); renderErr != nil {
			logger.Warnf("loadURL (%s): error rendering debug HTML: %s", url, renderErr.Error())
		} else {
			logger.Infof("loadURL (%s) response: \n%s", url, b.String())
		}
	}

	return ret, err
}
func scrapePerformerURLXpath(c scraperTypeConfig, url string) (*models.ScrapedPerformer, error) {
scraper := c.scraperConfig.XPathScrapers[c.Scraper]
@@ -511,7 +555,7 @@ func scrapePerformerURLXpath(c scraperTypeConfig, url string) (*models.ScrapedPe
return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
}
doc, err := htmlquery.LoadURL(url)
doc, err := loadURL(url, c.scraperConfig)
if err != nil {
return nil, err
@@ -527,7 +571,7 @@ func scrapeSceneURLXPath(c scraperTypeConfig, url string) (*models.ScrapedScene,
return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
}
doc, err := htmlquery.LoadURL(url)
doc, err := loadURL(url, c.scraperConfig)
if err != nil {
return nil, err
@@ -551,7 +595,7 @@ func scrapePerformerNamesXPath(c scraperTypeConfig, name string) ([]*models.Scra
u := c.QueryURL
u = strings.Replace(u, placeholder, escapedName, -1)
doc, err := htmlquery.LoadURL(u)
doc, err := loadURL(u, c.scraperConfig)
if err != nil {
return nil, err