Configurable scraper user agent string (#409)

* Add debug scrape option.

Co-authored-by: HiddenPants255 <>
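The new loadURL helper in the diff below reads two things defined outside these hunks: the global config.GetScraperUserAgent() accessor and a per-scraper DebugOptions field. As a rough sketch of the shape the diff assumes — type names, field names, and yaml tags are inferred from the usage in loadURL, not copied from the stash source:

package scraper

// Inferred sketch only: loadURL checks c.DebugOptions != nil &&
// c.DebugOptions.PrintHTML, which implies something like the following.
type scraperDebugOptions struct {
	PrintHTML bool `yaml:"printHTML"` // dump fetched HTML to the log when true
}

type scraperConfig struct {
	// ... other scraper fields elided ...
	DebugOptions *scraperDebugOptions `yaml:"debug"`
}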
@@ -1,7 +1,9 @@
 package scraper
 
 import (
+	"bytes"
 	"errors"
+	"net/http"
 	"net/url"
 	"reflect"
 	"regexp"
@@ -10,11 +12,17 @@ import (
 
 	"github.com/antchfx/htmlquery"
+	"golang.org/x/net/html"
 	"golang.org/x/net/html/charset"
 
 	"github.com/stashapp/stash/pkg/logger"
+	"github.com/stashapp/stash/pkg/manager/config"
 	"github.com/stashapp/stash/pkg/models"
 )
 
+// Timeout for the scrape http request. Includes transfer time. May want to make this
+// configurable at some point.
+const scrapeGetTimeout = time.Second * 30
+
 type commonXPathConfig map[string]string
 
 func (c commonXPathConfig) applyCommon(src string) string {
@@ -197,7 +205,7 @@ func (c xpathScraperAttrConfig) applySubScraper(value string) string {
 		return value
 	}
 
-	doc, err := htmlquery.LoadURL(value)
+	doc, err := loadURL(value, nil)
 
 	if err != nil {
 		logger.Warnf("Error getting URL '%s' for sub-scraper: %s", value, err.Error())
@@ -504,6 +512,42 @@ func (r xPathResults) setKey(index int, key string, value string) xPathResults {
 	return r
 }
 
+func loadURL(url string, c *scraperConfig) (*html.Node, error) {
+	client := &http.Client{
+		Timeout: scrapeGetTimeout,
+	}
+	req, err := http.NewRequest("GET", url, nil)
+	if err != nil {
+		return nil, err
+	}
+
+	userAgent := config.GetScraperUserAgent()
+	if userAgent != "" {
+		req.Header.Set("User-Agent", userAgent)
+	}
+
+	resp, err := client.Do(req)
+	if err != nil {
+		return nil, err
+	}
+	defer resp.Body.Close()
+
+	r, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type"))
+	if err != nil {
+		return nil, err
+	}
+
+	ret, err := html.Parse(r)
+
+	if err == nil && c != nil && c.DebugOptions != nil && c.DebugOptions.PrintHTML {
+		var b bytes.Buffer
+		html.Render(&b, ret)
+		logger.Infof("loadURL (%s) response: \n%s", url, b.String())
+	}
+
+	return ret, err
+}
+
 func scrapePerformerURLXpath(c scraperTypeConfig, url string) (*models.ScrapedPerformer, error) {
 	scraper := c.scraperConfig.XPathScrapers[c.Scraper]
 
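Note the asymmetry in the call sites: the sub-scraper hunk above passes nil, while the top-level scrapers in the hunks below pass c.scraperConfig, so the printHTML debug dump only ever fires for top-level scrapes. A hypothetical caller illustrating both modes (scrapeBoth and its parameters are invented for illustration, not part of the commit):

// Hypothetical illustration of the two loadURL modes.
func scrapeBoth(c scraperTypeConfig, topURL, subURL string) error {
	// Top-level scrape: debug options from the scraper config are honored.
	if _, err := loadURL(topURL, c.scraperConfig); err != nil {
		return err
	}
	// Sub-scrape: a nil config means the HTML debug dump is always skipped.
	_, err := loadURL(subURL, nil)
	return err
}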
@@ -511,7 +555,7 @@ func scrapePerformerURLXpath(c scraperTypeConfig, url string) (*models.ScrapedPerformer, error) {
 		return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
 	}
 
-	doc, err := htmlquery.LoadURL(url)
+	doc, err := loadURL(url, c.scraperConfig)
 
 	if err != nil {
 		return nil, err
@@ -527,7 +571,7 @@ func scrapeSceneURLXPath(c scraperTypeConfig, url string) (*models.ScrapedScene, error) {
 		return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
 	}
 
-	doc, err := htmlquery.LoadURL(url)
+	doc, err := loadURL(url, c.scraperConfig)
 
 	if err != nil {
 		return nil, err
@@ -551,7 +595,7 @@ func scrapePerformerNamesXPath(c scraperTypeConfig, name string) ([]*models.ScrapedPerformer, error) {
 	u := c.QueryURL
 	u = strings.Replace(u, placeholder, escapedName, -1)
 
-	doc, err := htmlquery.LoadURL(u)
+	doc, err := loadURL(u, c.scraperConfig)
 
 	if err != nil {
 		return nil, err
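For reference, the request pattern loadURL adopts — a client-level timeout, an optional User-Agent override, and charset-aware HTML parsing — can be reproduced outside stash in a few lines. The sketch below is a standalone approximation under those assumptions; fetchHTML, the example URL, and the example agent string are ours, not the project's:

package main

import (
	"fmt"
	"net/http"
	"time"

	"golang.org/x/net/html"
	"golang.org/x/net/html/charset"
)

// Standalone approximation of the loadURL pattern.
func fetchHTML(url, userAgent string) (*html.Node, error) {
	// Client-level timeout covers the whole request, including transfer time.
	client := &http.Client{Timeout: 30 * time.Second}

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}
	if userAgent != "" {
		// Only override Go's default user agent when one is configured.
		req.Header.Set("User-Agent", userAgent)
	}

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// Decode the body to UTF-8 based on the declared Content-Type.
	r, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type"))
	if err != nil {
		return nil, err
	}
	return html.Parse(r)
}

func main() {
	doc, err := fetchHTML("https://example.com/", "Mozilla/5.0 (compatible; example)")
	if err != nil {
		fmt.Println("fetch failed:", err)
		return
	}
	fmt.Println("parsed document node type:", doc.Type)
}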