Configurable scraper user agent string (#409)

* Add debug scrape option.

Co-authored-by: HiddenPants255 <>
WithoutPants (committed by GitHub)
2020-03-21 08:55:15 +11:00
parent ff495361d9
commit abf2b49803
10 changed files with 122 additions and 11 deletions

View File

@@ -11,6 +11,7 @@ fragment ConfigGeneralData on ConfigGeneralResult {
   logLevel
   logAccess
   excludes
+  scraperUserAgent
 }

 fragment ConfigInterfaceData on ConfigInterfaceResult {
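Any client selecting ConfigGeneralData now receives the new field automatically. As a minimal sketch, a standalone query for just this value might look like the following; the configuration/general wrapper shape is assumed from how the UI reads conf.general.scraperUserAgent later in this commit:

    query Configuration {
      configuration {
        general {
          scraperUserAgent
        }
      }
    }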

View File

@@ -32,6 +32,8 @@ input ConfigGeneralInput {
   logAccess: Boolean!
   """Array of file regexp to exclude from Scan"""
   excludes: [String!]
+  """Scraper user agent string"""
+  scraperUserAgent: String
 }

 type ConfigGeneralResult {
@@ -59,6 +61,8 @@ type ConfigGeneralResult {
   logAccess: Boolean!
   """Array of file regexp to exclude from Scan"""
   excludes: [String!]!
+  """Scraper user agent string"""
+  scraperUserAgent: String
 }

 input ConfigInterfaceInput {
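Because scraperUserAgent is nullable on the input type, a caller can update it without touching unrelated settings. A hedged sketch of such a call, with the configureGeneral mutation name inferred from the ConfigureGeneral resolver below (in a real call, the non-null ConfigGeneralInput fields such as logAccess must also be supplied):

    mutation {
      configureGeneral(input: {
        # required (non-null) ConfigGeneralInput fields omitted for brevity
        scraperUserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
      }) {
        scraperUserAgent
      }
    }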

View File

@@ -76,6 +76,10 @@ func (r *mutationResolver) ConfigureGeneral(ctx context.Context, input models.Co
        config.Set(config.Exclude, input.Excludes)
    }

+   if input.ScraperUserAgent != nil {
+       config.Set(config.ScraperUserAgent, input.ScraperUserAgent)
+   }
+
    if err := config.Write(); err != nil {
        return makeConfigGeneralResult(), err
    }
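The nil guard gives the field omit-to-keep semantics: a mutation that leaves scraperUserAgent out does not clear the stored value. Setting it to an empty string effectively disables the custom header, since the consumers further down only attach User-Agent when the value is non-empty.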

View File

@@ -33,6 +33,8 @@ func makeConfigGeneralResult() *models.ConfigGeneralResult {
    maxTranscodeSize := config.GetMaxTranscodeSize()
    maxStreamingTranscodeSize := config.GetMaxStreamingTranscodeSize()

+   scraperUserAgent := config.GetScraperUserAgent()
+
    return &models.ConfigGeneralResult{
        Stashes:      config.GetStashPaths(),
        DatabasePath: config.GetDatabasePath(),
@@ -46,6 +48,7 @@ func makeConfigGeneralResult() *models.ConfigGeneralResult {
        LogLevel:  config.GetLogLevel(),
        LogAccess: config.GetLogAccess(),
        Excludes:  config.GetExcludes(),
+       ScraperUserAgent: &scraperUserAgent,
    }
 }
@@ -59,7 +62,6 @@ func makeConfigInterfaceResult() *models.ConfigInterfaceResult {
    cssEnabled := config.GetCSSEnabled()
    language := config.GetLanguage()

    return &models.ConfigInterfaceResult{
        SoundOnPreview: &soundOnPreview,
        WallShowTitle:  &wallShowTitle,

View File

@@ -22,7 +22,6 @@ const Password = "password"
 const Database = "database"
-const ScrapersPath = "scrapers_path"
 const Exclude = "exclude"
 const MaxTranscodeSize = "max_transcode_size"
@@ -32,6 +31,10 @@ const Host = "host"
 const Port = "port"
 const ExternalHost = "external_host"

+// scraping options
+const ScrapersPath = "scrapers_path"
+const ScraperUserAgent = "scraper_user_agent"
+
 // i18n
 const Language = "language"
@@ -115,6 +118,10 @@ func GetScrapersPath() string {
    return viper.GetString(ScrapersPath)
 }

+func GetScraperUserAgent() string {
+   return viper.GetString(ScraperUserAgent)
+}
+
 func GetHost() string {
    return viper.GetString(Host)
 }
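Because the getter reads through viper, the value can also be seeded directly in stash's YAML configuration file rather than via the API. A hypothetical excerpt (the config file's name and location are not shown in this commit):

    # maps to the ScraperUserAgent constant above
    scraper_user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"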

View File

@@ -139,6 +139,10 @@ func (c *scrapeSceneByURLConfig) resolveFn() {
    }
 }

+type scraperDebugOptions struct {
+   PrintHTML bool `yaml:"printHTML"`
+}
+
 type scraperConfig struct {
    ID   string
    Name string `yaml:"name"`
@@ -148,8 +152,9 @@ type scraperConfig struct {
    SceneByFragment *sceneByFragmentConfig    `yaml:"sceneByFragment"`
    SceneByURL      []*scrapeSceneByURLConfig `yaml:"sceneByURL"`
-   StashServer     *stashServer              `yaml:"stashServer"`
-   XPathScrapers   xpathScrapers             `yaml:"xPathScrapers"`
+   DebugOptions    *scraperDebugOptions      `yaml:"debug"`
+   StashServer     *stashServer              `yaml:"stashServer"`
+   XPathScrapers   xpathScrapers             `yaml:"xPathScrapers"`
 }

 func loadScraperFromYAML(path string) (*scraperConfig, error) {
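Going by the yaml tags above, a scraper definition opts into the new debug output with a top-level debug block. A hypothetical fragment (fields not visible in this diff are elided):

    name: Example Scraper
    debug:
      printHTML: true   # dump the fetched page HTML to the log
    xPathScrapers:
      # ... scraper definitions ...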

View File

@@ -6,6 +6,7 @@ import (
    "strings"
    "time"

+   "github.com/stashapp/stash/pkg/manager/config"
    "github.com/stashapp/stash/pkg/models"
    "github.com/stashapp/stash/pkg/utils"
 )
@@ -52,8 +53,18 @@ func getImage(url string) (*string, error) {
        Timeout: imageGetTimeout,
    }

+   req, err := http.NewRequest("GET", url, nil)
+   if err != nil {
+       return nil, err
+   }
+
+   userAgent := config.GetScraperUserAgent()
+   if userAgent != "" {
+       req.Header.Set("User-Agent", userAgent)
+   }
+
    // assume is a URL for now
-   resp, err := client.Get(url)
+   resp, err := client.Do(req)
    if err != nil {
        return nil, err
    }
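The switch from client.Get(url) to an explicit http.NewRequest plus client.Do is what makes the header injection possible: http.Client.Get offers no way to set request headers. And because User-Agent is only set when the configured value is non-empty, an unset option falls back to Go's default client behavior.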

View File

@@ -1,7 +1,9 @@
 package scraper

 import (
+   "bytes"
    "errors"
+   "net/http"
    "net/url"
    "reflect"
    "regexp"
@@ -10,11 +12,17 @@ import (
    "github.com/antchfx/htmlquery"
    "golang.org/x/net/html"
+   "golang.org/x/net/html/charset"

    "github.com/stashapp/stash/pkg/logger"
+   "github.com/stashapp/stash/pkg/manager/config"
    "github.com/stashapp/stash/pkg/models"
 )

+// Timeout for the scrape http request. Includes transfer time. May want to make this
+// configurable at some point.
+const scrapeGetTimeout = time.Second * 30
+
 type commonXPathConfig map[string]string

 func (c commonXPathConfig) applyCommon(src string) string {
@@ -197,7 +205,7 @@ func (c xpathScraperAttrConfig) applySubScraper(value string) string {
        return value
    }

-   doc, err := htmlquery.LoadURL(value)
+   doc, err := loadURL(value, nil)
    if err != nil {
        logger.Warnf("Error getting URL '%s' for sub-scraper: %s", value, err.Error())
@@ -504,6 +512,42 @@ func (r xPathResults) setKey(index int, key string, value string) xPathResults {
    return r
 }

+func loadURL(url string, c *scraperConfig) (*html.Node, error) {
+   client := &http.Client{
+       Timeout: scrapeGetTimeout,
+   }
+
+   req, err := http.NewRequest("GET", url, nil)
+   if err != nil {
+       return nil, err
+   }
+
+   userAgent := config.GetScraperUserAgent()
+   if userAgent != "" {
+       req.Header.Set("User-Agent", userAgent)
+   }
+
+   resp, err := client.Do(req)
+   if err != nil {
+       return nil, err
+   }
+   defer resp.Body.Close()
+
+   r, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type"))
+   if err != nil {
+       return nil, err
+   }
+
+   ret, err := html.Parse(r)
+
+   if err == nil && c != nil && c.DebugOptions != nil && c.DebugOptions.PrintHTML {
+       var b bytes.Buffer
+       html.Render(&b, ret)
+       logger.Infof("loadURL (%s) response: \n%s", url, b.String())
+   }
+
+   return ret, err
+}
+
 func scrapePerformerURLXpath(c scraperTypeConfig, url string) (*models.ScrapedPerformer, error) {
    scraper := c.scraperConfig.XPathScrapers[c.Scraper]
@@ -511,7 +555,7 @@ func scrapePerformerURLXpath(c scraperTypeConfig, url string) (*models.ScrapedPe
        return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
    }

-   doc, err := htmlquery.LoadURL(url)
+   doc, err := loadURL(url, c.scraperConfig)
    if err != nil {
        return nil, err
@@ -527,7 +571,7 @@ func scrapeSceneURLXPath(c scraperTypeConfig, url string) (*models.ScrapedScene,
        return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
    }

-   doc, err := htmlquery.LoadURL(url)
+   doc, err := loadURL(url, c.scraperConfig)
    if err != nil {
        return nil, err
@@ -551,7 +595,7 @@ func scrapePerformerNamesXPath(c scraperTypeConfig, name string) ([]*models.Scra
    u := c.QueryURL
    u = strings.Replace(u, placeholder, escapedName, -1)

-   doc, err := htmlquery.LoadURL(u)
+   doc, err := loadURL(u, c.scraperConfig)
    if err != nil {
        return nil, err
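Two details worth noting: applySubScraper passes nil for the scraper config, so printHTML debugging applies only to top-level page loads, and routing the response body through charset.NewReader lets non-UTF-8 pages be transcoded before html.Parse. A hedged, test-style sketch of exercising the new helper (it assumes config.Set wraps viper.Set, as the resolver usage earlier in this commit suggests):

    // hypothetical test, in package scraper; imports: net/http,
    // net/http/httptest, testing, pkg/manager/config
    func TestLoadURLSetsUserAgent(t *testing.T) {
        config.Set(config.ScraperUserAgent, "stash-test-agent")

        srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
            // the configured value should arrive as the User-Agent header
            if got := r.Header.Get("User-Agent"); got != "stash-test-agent" {
                t.Errorf("User-Agent = %q, want %q", got, "stash-test-agent")
            }
            w.Write([]byte("<html><body>ok</body></html>"))
        }))
        defer srv.Close()

        // nil config: no debug printing, mirroring the sub-scraper call path
        if _, err := loadURL(srv.URL, nil); err != nil {
            t.Fatal(err)
        }
    }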

View File

@@ -29,6 +29,7 @@ export const SettingsConfigurationPanel: React.FC = () => {
   const [logLevel, setLogLevel] = useState<string>("Info");
   const [logAccess, setLogAccess] = useState<boolean>(true);
   const [excludes, setExcludes] = useState<string[]>([]);
+  const [scraperUserAgent, setScraperUserAgent] = useState<string | undefined>(undefined);

   const { data, error, loading } = StashService.useConfiguration();
@@ -44,7 +45,8 @@ export const SettingsConfigurationPanel: React.FC = () => {
     logOut,
     logLevel,
     logAccess,
-    excludes
+    excludes,
+    scraperUserAgent
   });

   useEffect(() => {
@@ -66,6 +68,7 @@ export const SettingsConfigurationPanel: React.FC = () => {
       setLogLevel(conf.general.logLevel);
       setLogAccess(conf.general.logAccess);
       setExcludes(conf.general.excludes);
+      setScraperUserAgent(conf.general.scraperUserAgent ?? undefined);
     }
   }, [data, error]);
@@ -289,6 +292,22 @@ export const SettingsConfigurationPanel: React.FC = () => {
       <hr />

+      <Form.Group id="generated-path">
+        <h6>Scraping</h6>
+        <Form.Control
+          className="col col-sm-6 text-input"
+          defaultValue={scraperUserAgent}
+          onChange={(e: React.FormEvent<HTMLInputElement>) =>
+            setScraperUserAgent(e.currentTarget.value)
+          }
+        />
+        <Form.Text className="text-muted">
+          User-Agent string used during scrape http requests
+        </Form.Text>
+      </Form.Group>
+
+      <hr />
+
       <Form.Group>
         <h4>Authentication</h4>
         <Form.Group id="username">
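One caveat on this block: the input is uncontrolled (defaultValue rather than value), so if the panel renders before the configuration query resolves, the fetched value may not appear until remount; the reused Form.Group id "generated-path" also looks like a holdover from the neighbouring section.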

View File

@@ -32,6 +32,7 @@ export const SettingsConfigurationPanel: FunctionComponent<IProps> = (props: IPr
   const [logLevel, setLogLevel] = useState<string>("Info");
   const [logAccess, setLogAccess] = useState<boolean>(true);
   const [excludes, setExcludes] = useState<(string)[]>([]);
+  const [scraperUserAgent, setScraperUserAgent] = useState<string | undefined>(undefined);

   const { data, error, loading } = StashService.useConfiguration();
@@ -48,7 +49,7 @@ export const SettingsConfigurationPanel: FunctionComponent<IProps> = (props: IPr
     logLevel,
     logAccess,
     excludes,
+    scraperUserAgent,
   });

   useEffect(() => {
@@ -67,6 +68,7 @@ export const SettingsConfigurationPanel: FunctionComponent<IProps> = (props: IPr
       setLogLevel(conf.general.logLevel);
       setLogAccess(conf.general.logAccess);
       setExcludes(conf.general.excludes);
+      setScraperUserAgent(conf.general.scraperUserAgent);
     }
   }, [data, error]);
@@ -229,6 +231,18 @@ export const SettingsConfigurationPanel: FunctionComponent<IProps> = (props: IPr
       </FormGroup>

       <Divider />
+      <FormGroup>
+        <H4>Scraping</H4>
+        <FormGroup
+          label="Scraper User-Agent string"
+          helperText="User-Agent string used during scrape http requests"
+        >
+          <InputGroup value={scraperUserAgent} onChange={(e: any) => setScraperUserAgent(e.target.value)} />
+        </FormGroup>
+      </FormGroup>
+
+      <Divider />

       <FormGroup>
         <H4>Authentication</H4>
         <FormGroup