Add http headers support to scraper (#1273)
@@ -175,11 +175,17 @@ type clickOptions struct {
     Sleep int `yaml:"sleep"`
 }
 
+type header struct {
+    Key   string `yaml:"Key"`
+    Value string `yaml:"Value"`
+}
+
 type scraperDriverOptions struct {
     UseCDP  bool             `yaml:"useCDP"`
     Sleep   int              `yaml:"sleep"`
     Clicks  []*clickOptions  `yaml:"clicks"`
     Cookies []*cookieOptions `yaml:"cookies"`
+    Headers []*header        `yaml:"headers"`
 }
 
 func loadScraperFromYAML(id string, reader io.Reader) (*config, error) {
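As a quick illustration of the new fields (not part of the diff), here is a minimal standalone sketch of how a scraper's `driver` section maps onto these structs. It assumes `gopkg.in/yaml.v2` (whichever YAML library stash actually uses, explicit tags behave the same) and trims the struct to the fields this hunk touches. Note the capitalized `Key`/`Value` yaml tags: the YAML keys must be capitalized as well.

```go
// Illustrative sketch only: unmarshalling a driver.headers list into the
// structs added above. gopkg.in/yaml.v2 is an assumption for this example.
package main

import (
	"fmt"

	"gopkg.in/yaml.v2"
)

type header struct {
	Key   string `yaml:"Key"`
	Value string `yaml:"Value"`
}

type scraperDriverOptions struct {
	UseCDP  bool      `yaml:"useCDP"`
	Sleep   int       `yaml:"sleep"`
	Headers []*header `yaml:"headers"`
}

func main() {
	src := []byte(`
useCDP: false
headers:
  - Key: Authorization
    Value: Bearer example-token
`)
	var opts scraperDriverOptions
	if err := yaml.Unmarshal(src, &opts); err != nil {
		panic(err)
	}
	for _, h := range opts.Headers {
		fmt.Printf("%s: %s\n", h.Key, h.Value) // Authorization: Bearer example-token
	}
}
```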
@@ -74,12 +74,21 @@ func loadURL(url string, scraperConfig config, globalConfig GlobalConfig) (io.Re
         req.Header.Set("User-Agent", userAgent)
     }
 
+    if driverOptions != nil { // setting the Headers after the UA allows us to override it from inside the scraper
+        for _, h := range driverOptions.Headers {
+            if h.Key != "" {
+                req.Header.Set(h.Key, h.Value)
+                logger.Debugf("[scraper] adding header <%s:%s>", h.Key, h.Value)
+            }
+        }
+    }
+
     resp, err := client.Do(req)
     if err != nil {
         return nil, err
     }
     if resp.StatusCode >= 400 {
-        return nil, fmt.Errorf("http error %d", resp.StatusCode)
+        return nil, fmt.Errorf("http error %d:%s", resp.StatusCode, http.StatusText(resp.StatusCode))
     }
 
     defer resp.Body.Close()
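The reworked error message appends the standard reason phrase via `http.StatusText`. A tiny sketch (not stash code) of what the formatted error reads like:

```go
// Illustrative only: the error string produced by the new fmt.Errorf call.
package main

import (
	"fmt"
	"net/http"
)

func main() {
	code := 403
	err := fmt.Errorf("http error %d:%s", code, http.StatusText(code))
	fmt.Println(err) // http error 403:Forbidden
}
```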
@@ -156,10 +165,13 @@ func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig Glo
     defer cancel()
 
     var res string
+    headers := cdpHeaders(driverOptions)
+
     err := chromedp.Run(ctx,
         network.Enable(),
         setCDPCookies(driverOptions),
         printCDPCookies(driverOptions, "Cookies found"),
+        network.SetExtraHTTPHeaders(network.Headers(headers)),
         chromedp.Navigate(url),
         chromedp.Sleep(sleepDuration),
         setCDPClicks(driverOptions),
@@ -241,3 +253,16 @@ func cdpNetwork(enable bool) chromedp.Action {
         return nil
     })
 }
+
+func cdpHeaders(driverOptions scraperDriverOptions) map[string]interface{} {
+    headers := map[string]interface{}{}
+    if driverOptions.Headers != nil {
+        for _, h := range driverOptions.Headers {
+            if h.Key != "" {
+                headers[h.Key] = h.Value
+                logger.Debugf("[scraper] adding header <%s:%s>", h.Key, h.Value)
+            }
+        }
+    }
+    return headers
+}
@@ -4,6 +4,7 @@
 * Added scene queue.
 
 ### 🎨 Improvements
+* Support HTTP request headers in scrapers.
 * Sort performers by gender in scene/image/gallery cards and details.
 * Add popover buttons for scenes/images/galleries on performer/studio/tag cards.
 * Add slideshow to image wall view.
@@ -544,6 +544,24 @@ When developing a scraper you can have a look at the cookies set by a site by ad
 
 and having a look at the log / console in debug mode.
 
+### Headers
+
+Request headers can be sent when using a scraper.
+They are set in the `driver` section and are supported for plain, CDP-enabled and JSON scrapers.
+Each header consists of a Key and a Value. If the Key is empty or not defined, the header is ignored.
+
+```yaml
+driver:
+  headers:
+    - Key: User-Agent
+      Value: My Stash Scraper
+    - Key: Authorization
+      Value: Bearer ds3sdfcFdfY17p4qBkTVF03zscUU2glSjWF17bZyoe8
+```
+
+* Headers are set after stash's `User-Agent` configuration option is applied.
+This means that setting a `User-Agent` header in the scraper overrides the one from the configuration settings.
+
 ### XPath scraper example
 
 A performer and scene xpath scraper is shown as an example below:
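The override note above follows from Go's `http.Header.Set` semantics: the last `Set` for a key replaces any earlier value, and the scraper's headers are applied after the configured `User-Agent`. A minimal sketch (illustrative only, not stash code) of that ordering:

```go
// Illustrative only: why a scraper-defined User-Agent header wins over the
// one applied from the configuration settings.
package main

import (
	"fmt"
	"net/http"
)

func main() {
	req, _ := http.NewRequest("GET", "https://example.com", nil)

	// Applied first, from stash's configuration-level User-Agent option.
	req.Header.Set("User-Agent", "configured-user-agent")

	// Applied afterwards, from the scraper's driver.headers section,
	// so it replaces the configured value.
	req.Header.Set("User-Agent", "My Stash Scraper")

	fmt.Println(req.Header.Get("User-Agent")) // My Stash Scraper
}
```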
@@ -614,31 +632,42 @@ A performer and scene scraper for ThePornDB is shown below:
 name: ThePornDB
 performerByName:
   action: scrapeJson
-  queryURL: https://metadataapi.net/api/performers?q={}
+  queryURL: https://api.metadataapi.net/performers?q={}
   scraper: performerSearch
 performerByURL:
   - action: scrapeJson
     url:
-      - https://metadataapi.net/api/performers/
+      - https://api.metadataapi.net/performers/
     scraper: performerScraper
 sceneByURL:
   - action: scrapeJson
     url:
-      - https://metadataapi.net/api/scenes/
+      - https://api.metadataapi.net/scenes/
     scraper: sceneScraper
 sceneByFragment:
   action: scrapeJson
-  queryURL: https://metadataapi.net/api/scenes?parse={filename}&limit=1
+  queryURL: https://api.metadataapi.net/scenes?parse={filename}&hash={oshash}&limit=1
   scraper: sceneQueryScraper
+  queryURLReplace:
+    filename:
+      - regex: "[^a-zA-Z\\d\\-._~]" # clean filename so that it can construct a valid url
+        with: "." # "%20"
+      - regex: HEVC
+        with:
+      - regex: x265
+        with:
+      - regex: \.+
+        with: "."
 jsonScrapers:
   performerSearch:
     performer:
       Name: data.#.name
       URL:
         selector: data.#.id
-        replace:
-          - regex: ^
-            with: https://metadataapi.net/api/performers/
+        postProcess:
+          - replace:
+              - regex: ^
+                with: https://api.metadataapi.net/performers/
 
   performerScraper:
     common:
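To make the `queryURLReplace` chain above concrete, here is an illustrative sketch (not stash's implementation) that applies the same regexes to a hypothetical filename and prints the URL-safe value that would be substituted into `{filename}`:

```go
// Illustrative only: the effect of the queryURLReplace regex chain on a
// hypothetical filename. Stash applies these replacements internally.
package main

import (
	"fmt"
	"regexp"
)

func main() {
	filename := "My Scene 2021 1080p HEVC x265.mp4" // hypothetical input

	steps := []struct{ regex, with string }{
		{`[^a-zA-Z\d\-._~]`, "."}, // replace anything that is not URL-unreserved
		{`HEVC`, ""},              // drop encoder tags
		{`x265`, ""},
		{`\.+`, "."}, // collapse the runs of dots left behind
	}
	for _, s := range steps {
		filename = regexp.MustCompile(s.regex).ReplaceAllString(filename, s.with)
	}
	fmt.Println(filename) // My.Scene.2021.1080p.mp4
}
```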
@@ -648,7 +677,12 @@ jsonScrapers:
       Gender: $extras.gender
       Birthdate: $extras.birthday
       Ethnicity: $extras.ethnicity
-      Height: $extras.height
+      Height:
+        selector: $extras.height
+        postProcess:
+          - replace:
+              - regex: cm
+                with:
       Measurements: $extras.measurements
       Tattoos: $extras.tattoos
       Piercings: $extras.piercings
@@ -687,6 +721,13 @@ jsonScrapers:
         Name: $data.site.name
       Tags:
         Name: $data.tags.#.tag
+driver:
+  headers:
+    - Key: User-Agent
+      Value: Stash JSON Scraper
+    - Key: Authorization
+      Value: Bearer lPdwFdfY17p4qBkTVF03zscUU2glSjdf17bZyoe # use an actual API Key here
+# Last Updated April 7, 2021
 ```
 
 ## Object fields