Add support for setting cookies in the scraper (#934)

This commit is contained in:
bnkai
2020-12-01 07:34:09 +02:00
committed by GitHub
parent aecbd236bc
commit a96ab9ce6f
5 changed files with 230 additions and 4 deletions

View File

@@ -157,9 +157,22 @@ type scraperDebugOptions struct {
PrintHTML bool `yaml:"printHTML"`
}
// scraperCookies describes a single cookie to set, as declared in a
// scraper's YAML configuration. The capitalized yaml keys match the
// documented scraper configuration format.
type scraperCookies struct {
	Name   string `yaml:"Name"`
	Value  string `yaml:"Value"`
	Domain string `yaml:"Domain"`
	Path   string `yaml:"Path"`
}
// cookieOptions groups a set of cookies with the request URL they apply
// to. CookieURL is only consulted by the native (non-CDP) scraper; the
// CDP path matches cookies purely by their Domain field.
type cookieOptions struct {
	CookieURL string            `yaml:"CookieURL"`
	Cookies   []*scraperCookies `yaml:"Cookies"`
}
type scraperDriverOptions struct {
UseCDP bool `yaml:"useCDP"`
Sleep int `yaml:"sleep"`
UseCDP bool `yaml:"useCDP"`
Sleep int `yaml:"sleep"`
Cookies []*cookieOptions `yaml:"cookies"`
}
func loadScraperFromYAML(id string, reader io.Reader) (*config, error) {

142
pkg/scraper/cookies.go Normal file
View File

@@ -0,0 +1,142 @@
package scraper
import (
"context"
"fmt"
"net/http"
"net/http/cookiejar"
"net/url"
"time"
"github.com/chromedp/cdproto/cdp"
"github.com/chromedp/cdproto/network"
"github.com/chromedp/chromedp"
"github.com/stashapp/stash/pkg/logger"
)
// set cookies for the native http client
// setCookies installs the cookies listed in the scraper config into the
// jar used by the native http client. It is a no-op when there are no
// driver options or when CDP is enabled (CDP cookies are handled by
// setCDPCookies instead).
//
// Fixes over the original: the write-only foundURLs slice (dead code) is
// removed, the local variable no longer shadows the net/url package, and
// guard clauses replace the deep nesting.
func setCookies(jar *cookiejar.Jar, scraperConfig config) {
	driverOptions := scraperConfig.DriverOptions
	if driverOptions == nil || driverOptions.UseCDP {
		return
	}

	for _, ckURL := range driverOptions.Cookies { // go through all cookie groups
		// CookieURL must be a valid URL, including the scheme
		u, err := url.Parse(ckURL.CookieURL)
		if err != nil {
			logger.Warnf("Skipping jar cookies for cookieURL %s. Error %s", ckURL.CookieURL, err)
			continue
		}

		httpCookies := make([]*http.Cookie, 0, len(ckURL.Cookies))
		for _, cookie := range ckURL.Cookies {
			httpCookies = append(httpCookies, &http.Cookie{
				Name:   cookie.Name,
				Value:  cookie.Value,
				Path:   cookie.Path,
				Domain: cookie.Domain,
			})
		}

		// jar.SetCookies only keeps cookies whose domain matches the URL,
		// so a nil result afterwards means the whole group was rejected.
		jar.SetCookies(u, httpCookies)
		if jar.Cookies(u) == nil {
			logger.Warnf("Setting jar cookies for %s failed", u.String())
		}
	}
}
// print all cookies from the jar of the native http client
// printCookies debug-logs the jar cookies of the native http client for
// every parseable CookieURL in the scraper config. Nothing is printed
// when driver options are absent, CDP is enabled, or no URL parses.
func printCookies(jar *cookiejar.Jar, scraperConfig config, msg string) {
	opts := scraperConfig.DriverOptions
	if opts == nil || opts.UseCDP {
		return
	}

	// collect every CookieURL that parses; invalid ones are silently skipped
	var parsed []*url.URL
	for _, group := range opts.Cookies {
		u, err := url.Parse(group.CookieURL)
		if err != nil {
			continue
		}
		parsed = append(parsed, u)
	}

	if len(parsed) == 0 {
		return
	}
	logger.Debugf("%s\n", msg)
	printJarCookies(jar, parsed)
}
// print all cookies from the jar of the native http client for given urls
// printJarCookies debug-logs the name and value of every cookie the jar
// holds for each of the given URLs.
func printJarCookies(jar *cookiejar.Jar, urls []*url.URL) {
	for _, u := range urls {
		logger.Debugf("Jar cookies for %s", u.String())
		cookies := jar.Cookies(u)
		for i := range cookies {
			logger.Debugf("[%d]: Name: \"%s\" Value: \"%s\"", i, cookies[i].Name, cookies[i].Value)
		}
	}
}
// set all cookies listed in the scraper config
// setCDPCookies returns a chromedp task list that installs every cookie
// from the scraper driver config into the attached Chrome instance.
func setCDPCookies(driverOptions scraperDriverOptions) chromedp.Tasks {
	return chromedp.Tasks{
		chromedp.ActionFunc(func(ctx context.Context) error {
			// every cookie gets the same expiration: 180 days from now
			expiry := cdp.TimeSinceEpoch(time.Now().Add(180 * 24 * time.Hour))
			for _, group := range driverOptions.Cookies {
				for _, ck := range group.Cookies {
					if err := setSingleCDPCookie(ctx, ck, &expiry); err != nil {
						return err
					}
				}
			}
			return nil
		}),
	}
}

// setSingleCDPCookie sets one cookie through the CDP network domain,
// returning an error when the protocol call fails or Chrome rejects it.
func setSingleCDPCookie(ctx context.Context, ck *scraperCookies, expiry *cdp.TimeSinceEpoch) error {
	ok, err := network.SetCookie(ck.Name, ck.Value).
		WithExpires(expiry).
		WithDomain(ck.Domain).
		WithPath(ck.Path).
		WithHTTPOnly(false).
		WithSecure(false).
		Do(ctx)
	if err != nil {
		return err
	}
	if !ok {
		return fmt.Errorf("could not set chrome cookie %s", ck.Name)
	}
	return nil
}
// print cookies whose domain is included in the scraper config
// printCDPCookies returns a chromedp action that debug-logs every Chrome
// cookie whose domain is listed in the scraper driver config; cookies
// from unrelated domains are skipped.
func printCDPCookies(driverOptions scraperDriverOptions, msg string) chromedp.Action {
	return chromedp.ActionFunc(func(ctx context.Context) error {
		allCookies, err := network.GetAllCookies().Do(ctx)
		if err != nil {
			return err
		}

		// build the set of domains the scraper config mentions
		domains := make(map[string]struct{})
		for _, group := range driverOptions.Cookies {
			for _, ck := range group.Cookies {
				domains[ck.Domain] = struct{}{}
			}
		}

		// only print when the scraper actually lists cookies
		if len(domains) == 0 {
			return nil
		}

		logger.Debugf("%s\n", msg)
		for i, ck := range allCookies {
			if _, listed := domains[ck.Domain]; listed {
				logger.Debugf("[%d]: Name: \"%s\" Value: \"%s\" Domain: \"%s\"", i, ck.Name, ck.Value, ck.Domain)
			}
		}
		return nil
	})
}

View File

@@ -17,9 +17,10 @@ import (
"github.com/chromedp/cdproto/network"
"github.com/chromedp/chromedp"
jsoniter "github.com/json-iterator/go"
"github.com/stashapp/stash/pkg/logger"
"golang.org/x/net/html/charset"
"golang.org/x/net/publicsuffix"
"github.com/stashapp/stash/pkg/logger"
)
// Timeout for the scrape http request. Includes transfer time. May want to make this
@@ -42,6 +43,9 @@ func loadURL(url string, scraperConfig config, globalConfig GlobalConfig) (io.Re
return nil, er
}
setCookies(jar, scraperConfig)
printCookies(jar, scraperConfig, "Jar cookies set from scraper")
client := &http.Client{
Timeout: scrapeGetTimeout,
// defaultCheckRedirect code with max changed from 10 to 20
@@ -76,6 +80,7 @@ func loadURL(url string, scraperConfig config, globalConfig GlobalConfig) (io.Re
}
bodyReader := bytes.NewReader(body)
printCookies(jar, scraperConfig, "Jar cookies found for scraper urls")
return charset.NewReader(bodyReader, resp.Header.Get("Content-Type"))
}
@@ -140,6 +145,8 @@ func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig Glo
var res string
err := chromedp.Run(ctx,
network.Enable(),
setCDPCookies(driverOptions),
printCDPCookies(driverOptions, "Cookies found"),
chromedp.Navigate(url),
chromedp.Sleep(sleepDuration),
chromedp.ActionFunc(func(ctx context.Context) error {
@@ -150,6 +157,7 @@ func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig Glo
res, err = dom.GetOuterHTML().WithNodeID(node.NodeID).Do(ctx)
return err
}),
printCDPCookies(driverOptions, "Cookies set"),
)
if err != nil {
return nil, err

View File

@@ -1,4 +1,5 @@
### 🎨 Improvements
* Add support for setting cookies in scrapers.
* Truncate long text and show on hover.
* Show scene studio as text where image is missing.
* Use natural sort for titles and movie names.

View File

@@ -395,7 +395,69 @@ Optionally, you can add a `sleep` value under the `driver` section. This specifi
When `useCDP` is set to true, stash will execute or connect to an instance of Chrome. The behaviour is dictated by the `Chrome CDP path` setting in the user configuration. If left empty, stash will attempt to find the Chrome executable in the path environment, and will fail if it cannot find one.
`Chrome CDP path` can be set to a path to the chrome executable, or an http(s) address to remote chrome instance (for example: `http://localhost:9222/json/version`).
`Chrome CDP path` can be set to a path to the chrome executable, or an http(s) address to remote chrome instance (for example: `http://localhost:9222/json/version`). As remote instance a docker container can also be used with the `chromedp/headless-shell` image being highly recommended.
### Cookie support
On some websites, cookies are needed to bypass a welcome message or some other kind of protection. Stash supports setting cookies for both the direct xpath scraper and the CDP-based one. Due to implementation differences, the usage varies slightly between the two.
To use the cookie functionality a `cookies` sub section needs to be added to the `driver` section.
Each cookie element can consist of a `CookieURL` and a number of `Cookies`.
* `CookieURL` is only needed if you are using the direct / native scraper method. It is the request URL that we expect from the site being scraped. It must be in the same domain as the cookies being set; otherwise, every cookie in that group will fail to set. Likewise, if the `CookieURL` is not a valid URL, all cookies in that group will fail.
* `Cookies` are the actual cookies we set. When using CDP that's the only part required. They have `Name`, `Value`, `Domain`, `Path` values.
In the following example we use cookies for a site using the direct / native xpath scraper. We expect requests to come from `https://www.example.com` and `https://api.somewhere.com` that look for a `_warning` and a `_warn` cookie. A `_test2` cookie is also set just as a demo.
```yaml
driver:
cookies:
- CookieURL: "https://www.example.com"
Cookies:
- Name: "_warning"
Domain: ".example.com"
Value: "true"
Path: "/"
- Name: "_test2"
Value: "123412"
Domain: ".example.com"
Path: "/"
- CookieURL: "https://api.somewhere.com"
Cookies:
- Name: "_warn"
Value: "123"
Domain: ".somewhere.com"
```
The same functionality when using CDP would look like this:
```yaml
driver:
useCDP: true
cookies:
- Cookies:
- Name: "_warning"
Domain: ".example.com"
Value: "true"
Path: "/"
- Name: "_test2"
Value: "123412"
Domain: ".example.com"
Path: "/"
- Cookies:
- Name: "_warn"
Value: "123"
Domain: ".somewhere.com"
```
When developing a scraper you can have a look at the cookies set by a site by adding
* a `CookieURL` if you use the direct xpath scraper
* a `Domain` if you use the CDP scraper
and having a look at the log / console in debug mode.
### XPath scraper example