diff --git a/pkg/scraper/config.go b/pkg/scraper/config.go
index 4dca0f58e..26c01ba46 100644
--- a/pkg/scraper/config.go
+++ b/pkg/scraper/config.go
@@ -157,9 +157,22 @@ type scraperDebugOptions struct {
 	PrintHTML bool `yaml:"printHTML"`
 }
 
+type scraperCookies struct {
+	Name   string `yaml:"Name"`
+	Value  string `yaml:"Value"`
+	Domain string `yaml:"Domain"`
+	Path   string `yaml:"Path"`
+}
+
+type cookieOptions struct {
+	CookieURL string            `yaml:"CookieURL"`
+	Cookies   []*scraperCookies `yaml:"Cookies"`
+}
+
 type scraperDriverOptions struct {
-	UseCDP bool `yaml:"useCDP"`
-	Sleep  int  `yaml:"sleep"`
+	UseCDP  bool             `yaml:"useCDP"`
+	Sleep   int              `yaml:"sleep"`
+	Cookies []*cookieOptions `yaml:"cookies"`
 }
 
 func loadScraperFromYAML(id string, reader io.Reader) (*config, error) {
diff --git a/pkg/scraper/cookies.go b/pkg/scraper/cookies.go
new file mode 100644
index 000000000..0e3ae3d70
--- /dev/null
+++ b/pkg/scraper/cookies.go
@@ -0,0 +1,130 @@
+package scraper
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+	"net/http/cookiejar"
+	"net/url"
+	"time"
+
+	"github.com/chromedp/cdproto/cdp"
+	"github.com/chromedp/cdproto/network"
+	"github.com/chromedp/chromedp"
+
+	"github.com/stashapp/stash/pkg/logger"
+)
+
+// setCookies sets the configured cookies in the jar of the native http client
+func setCookies(jar *cookiejar.Jar, scraperConfig config) {
+	driverOptions := scraperConfig.DriverOptions
+	if driverOptions != nil && !driverOptions.UseCDP {
+		for _, ckURL := range driverOptions.Cookies { // go through all cookie groups
+			url, err := url.Parse(ckURL.CookieURL) // CookieURL must be a valid URL, including the scheme
+			if err != nil {
+				logger.Warnf("Skipping jar cookies for cookieURL %s. Error %s", ckURL.CookieURL, err)
+			} else {
+				var httpCookies []*http.Cookie
+
+				for _, cookie := range ckURL.Cookies {
+					httpCookies = append(httpCookies, &http.Cookie{
+						Name:   cookie.Name,
+						Value:  cookie.Value,
+						Path:   cookie.Path,
+						Domain: cookie.Domain,
+					})
+				}
+				jar.SetCookies(url, httpCookies) // jar.SetCookies only stores cookies whose domain matches the URL
+
+				if jar.Cookies(url) == nil {
+					logger.Warnf("Setting jar cookies for %s failed", url.String())
+				}
+			}
+		}
+	}
+}
+
+// printCookies prints all cookies from the jar of the native http client
+func printCookies(jar *cookiejar.Jar, scraperConfig config, msg string) {
+	driverOptions := scraperConfig.DriverOptions
+	if driverOptions != nil && !driverOptions.UseCDP {
+		var foundURLs []*url.URL
+
+		for _, ckURL := range driverOptions.Cookies { // go through all cookie groups
+			url, err := url.Parse(ckURL.CookieURL) // CookieURL must be a valid URL, including the scheme
+			if err == nil {
+				foundURLs = append(foundURLs, url)
+			}
+		}
+		if len(foundURLs) > 0 {
+			logger.Debugf("%s\n", msg)
+			printJarCookies(jar, foundURLs)
+		}
+	}
+}
+
+// printJarCookies prints all cookies from the jar of the native http client for the given urls
+func printJarCookies(jar *cookiejar.Jar, urls []*url.URL) {
+	for _, url := range urls {
+		logger.Debugf("Jar cookies for %s", url.String())
+		for i, cookie := range jar.Cookies(url) {
+			logger.Debugf("[%d]: Name: \"%s\" Value: \"%s\"", i, cookie.Name, cookie.Value)
+		}
+	}
+}
+
+// setCDPCookies sets all cookies listed in the scraper config
+func setCDPCookies(driverOptions scraperDriverOptions) chromedp.Tasks {
+	return chromedp.Tasks{
+		chromedp.ActionFunc(func(ctx context.Context) error {
+			// cookies are set to expire 180 days from now
+			expr := cdp.TimeSinceEpoch(time.Now().Add(180 * 24 * time.Hour))
+
+			for _, ckURL := range driverOptions.Cookies {
+				for _, cookie := range ckURL.Cookies {
+					success, err := network.SetCookie(cookie.Name, cookie.Value).
+						WithExpires(&expr).
+						WithDomain(cookie.Domain).
+						WithPath(cookie.Path).
+						WithHTTPOnly(false).
+						WithSecure(false).
+						Do(ctx)
+					if err != nil {
+						return err
+					}
+					if !success {
+						return fmt.Errorf("could not set chrome cookie %s", cookie.Name)
+					}
+				}
+			}
+			return nil
+		}),
+	}
+}
+
+// printCDPCookies prints cookies whose domain is included in the scraper config
+func printCDPCookies(driverOptions scraperDriverOptions, msg string) chromedp.Action {
+	return chromedp.ActionFunc(func(ctx context.Context) error {
+		chromeCookies, err := network.GetAllCookies().Do(ctx)
+		if err != nil {
+			return err
+		}
+
+		scraperDomains := make(map[string]struct{})
+		for _, ckURL := range driverOptions.Cookies {
+			for _, cookie := range ckURL.Cookies {
+				scraperDomains[cookie.Domain] = struct{}{}
+			}
+		}
+
+		if len(scraperDomains) > 0 { // only print cookies whose domain is listed in the scraper config
+			logger.Debugf("%s\n", msg)
+			for i, cookie := range chromeCookies {
+				if _, ok := scraperDomains[cookie.Domain]; ok {
+					logger.Debugf("[%d]: Name: \"%s\" Value: \"%s\" Domain: \"%s\"", i, cookie.Name, cookie.Value, cookie.Domain)
+				}
+			}
+		}
+		return nil
+	})
+}
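Note: the jar-based path above leans on `net/http/cookiejar` silently discarding any cookie whose `Domain` does not match the target URL's host, which is exactly the failure mode the `Warnf` calls surface. A minimal, self-contained sketch of that behaviour (not part of the patch; the URL and cookie names are illustrative, and the jar options assume the same `publicsuffix` list that `pkg/scraper/url.go` imports):

```go
package main

import (
	"fmt"
	"net/http"
	"net/http/cookiejar"
	"net/url"

	"golang.org/x/net/publicsuffix"
)

func main() {
	// jar configured like the one the scraper package builds
	jar, err := cookiejar.New(&cookiejar.Options{PublicSuffixList: publicsuffix.List})
	if err != nil {
		panic(err)
	}

	// hypothetical CookieURL from a scraper config
	u, err := url.Parse("https://www.example.com")
	if err != nil {
		panic(err)
	}

	jar.SetCookies(u, []*http.Cookie{
		// Domain matches the URL host: stored
		{Name: "_warning", Value: "true", Domain: ".example.com", Path: "/"},
		// Domain does not match: silently dropped by the jar
		{Name: "_warn", Value: "123", Domain: ".somewhere.com", Path: "/"},
	})

	for _, c := range jar.Cookies(u) {
		fmt.Printf("stored: %s=%s\n", c.Name, c.Value)
	}
}
```

Running this prints only `stored: _warning=true`; the `_warn` cookie is dropped because `.somewhere.com` does not match `www.example.com`. If every cookie in a group is dropped, `jar.Cookies(url)` comes back nil, which is what `setCookies` warns about.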
diff --git a/pkg/scraper/url.go b/pkg/scraper/url.go
index fa4ae44c4..9fd9d19e5 100644
--- a/pkg/scraper/url.go
+++ b/pkg/scraper/url.go
@@ -17,9 +17,10 @@ import (
 	"github.com/chromedp/cdproto/network"
 	"github.com/chromedp/chromedp"
 	jsoniter "github.com/json-iterator/go"
-	"github.com/stashapp/stash/pkg/logger"
 	"golang.org/x/net/html/charset"
 	"golang.org/x/net/publicsuffix"
+
+	"github.com/stashapp/stash/pkg/logger"
 )
 
 // Timeout for the scrape http request. Includes transfer time. May want to make this
@@ -42,6 +43,9 @@ func loadURL(url string, scraperConfig config, globalConfig GlobalConfig) (io.Re
 		return nil, er
 	}
 
+	setCookies(jar, scraperConfig)
+	printCookies(jar, scraperConfig, "Jar cookies set from scraper")
+
 	client := &http.Client{
 		Timeout: scrapeGetTimeout,
 		// defaultCheckRedirect code with max changed from 10 to 20
@@ -76,6 +80,7 @@ func loadURL(url string, scraperConfig config, globalConfig GlobalConfig) (io.Re
 	}
 
 	bodyReader := bytes.NewReader(body)
+	printCookies(jar, scraperConfig, "Jar cookies found for scraper urls")
 
 	return charset.NewReader(bodyReader, resp.Header.Get("Content-Type"))
 }
@@ -140,6 +145,8 @@ func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig Glo
 	var res string
 	err := chromedp.Run(ctx,
 		network.Enable(),
+		setCDPCookies(driverOptions),
+		printCDPCookies(driverOptions, "Cookies set by the scraper"),
 		chromedp.Navigate(url),
 		chromedp.Sleep(sleepDuration),
 		chromedp.ActionFunc(func(ctx context.Context) error {
@@ -150,6 +157,7 @@ func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig Glo
 			res, err = dom.GetOuterHTML().WithNodeID(node.NodeID).Do(ctx)
 			return err
 		}),
+		printCDPCookies(driverOptions, "Cookies found after navigation"),
 	)
 	if err != nil {
 		return nil, err
diff --git a/ui/v2.5/src/components/Changelog/versions/v050.md b/ui/v2.5/src/components/Changelog/versions/v050.md
index 9baae1a1b..5a0a8cad6 100644
--- a/ui/v2.5/src/components/Changelog/versions/v050.md
+++ b/ui/v2.5/src/components/Changelog/versions/v050.md
@@ -1,4 +1,5 @@
 ### 🎨 Improvements
+* Add support for setting cookies in scrapers.
 * Truncate long text and show on hover.
 * Show scene studio as text where image is missing.
 * Use natural sort for titles and movie names.
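Note: the `loadURL` changes above wire the jar into the scrape request, so a cookie placed in the jar rides along on every matching request. A hedged end-to-end sketch using `httptest` (illustrative only; it assumes the scraper's client is built with `Jar: jar`, which the truncated hunk above implies):

```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"net/http/cookiejar"
	"net/http/httptest"
	"net/url"
)

func main() {
	// test server that echoes back whichever cookie it receives
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if c, err := r.Cookie("_warning"); err == nil {
			fmt.Fprintf(w, "got cookie %s=%s", c.Name, c.Value)
		}
	}))
	defer srv.Close()

	jar, err := cookiejar.New(nil)
	if err != nil {
		panic(err)
	}
	u, _ := url.Parse(srv.URL) // stands in for a scraper CookieURL

	// host-only cookie (empty Domain) so it matches the test server's 127.0.0.1 host
	jar.SetCookies(u, []*http.Cookie{{Name: "_warning", Value: "true", Path: "/"}})

	client := &http.Client{Jar: jar} // loadURL presumably wires its jar in the same way
	resp, err := client.Get(srv.URL)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	body, _ := io.ReadAll(resp.Body)
	fmt.Println(string(body)) // got cookie _warning=true
}
```

The host-only cookie sidesteps domain matching against the test server's IP host; real scraper configs should set `Domain` as the docs below describe.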
diff --git a/ui/v2.5/src/docs/en/Scraping.md b/ui/v2.5/src/docs/en/Scraping.md
index 10207f8a6..544d25fc8 100644
--- a/ui/v2.5/src/docs/en/Scraping.md
+++ b/ui/v2.5/src/docs/en/Scraping.md
@@ -395,7 +395,69 @@
 Optionally, you can add a `sleep` value under the `driver` section. This specifies the time (in seconds) to wait after the page has loaded and before it is scraped. If unset, it defaults to 2 seconds.
 
 When `useCDP` is set to true, stash will execute or connect to an instance of Chrome. The behaviour is dictated by the `Chrome CDP path` setting in the user configuration. If left empty, stash will attempt to find the Chrome executable in the path environment, and will fail if it cannot find one.
 
-`Chrome CDP path` can be set to a path to the chrome executable, or an http(s) address to remote chrome instance (for example: `http://localhost:9222/json/version`).
+`Chrome CDP path` can be set to a path to the chrome executable, or to an http(s) address of a remote chrome instance (for example: `http://localhost:9222/json/version`). A docker container can also be used as the remote instance; the `chromedp/headless-shell` image is highly recommended.
+
+### Cookie support
+
+Some websites require cookies to bypass a welcome message or some other kind of protection. Stash supports setting cookies for both the direct xpath scraper and the CDP-based one. Due to implementation differences, usage varies slightly between the two.
+
+To use the cookie functionality, add a `cookies` subsection to the `driver` section.
+Each cookie element can consist of a `CookieURL` and a number of `Cookies`.
+
+* `CookieURL` is only needed when using the direct / native scraper method. It is the request URL we expect from the site being scraped. It must be in the same domain as the cookies being set, otherwise all cookies in that group will fail to be set. Likewise, if `CookieURL` is not a valid URL (including the scheme), the cookies of that group will fail.
+
+* `Cookies` are the actual cookies being set. When using CDP, this is the only part required. Each cookie has `Name`, `Value`, `Domain` and `Path` values.
+
+The following example sets cookies for a site using the direct / native xpath scraper. We expect requests to `https://www.example.com` and `https://api.somewhere.com` that look for a `_warning` and a `_warn` cookie respectively. A `_test2` cookie is also set purely as a demonstration.
+
+```yaml
+driver:
+  cookies:
+    - CookieURL: "https://www.example.com"
+      Cookies:
+        - Name: "_warning"
+          Domain: ".example.com"
+          Value: "true"
+          Path: "/"
+        - Name: "_test2"
+          Value: "123412"
+          Domain: ".example.com"
+          Path: "/"
+    - CookieURL: "https://api.somewhere.com"
+      Cookies:
+        - Name: "_warn"
+          Value: "123"
+          Domain: ".somewhere.com"
+```
+
+The same functionality when using CDP looks like this:
+
+```yaml
+driver:
+  useCDP: true
+  cookies:
+    - Cookies:
+        - Name: "_warning"
+          Domain: ".example.com"
+          Value: "true"
+          Path: "/"
+        - Name: "_test2"
+          Value: "123412"
+          Domain: ".example.com"
+          Path: "/"
+    - Cookies:
+        - Name: "_warn"
+          Value: "123"
+          Domain: ".somewhere.com"
+```
+
+When developing a scraper you can inspect the cookies a site sets by adding
+
+* a `CookieURL` if you use the direct xpath scraper, or
+
+* a `Domain` if you use the CDP scraper
+
+and then checking the log / console in debug mode.
 
 ### XPath scraper example
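As a companion to the docs above, a small sketch of how the `driver` yaml maps onto the structs added in `pkg/scraper/config.go`. This assumes `gopkg.in/yaml.v2`; the tag syntax in the patch is compatible with it, but stash's actual yaml library is not shown in this diff:

```go
package main

import (
	"fmt"

	"gopkg.in/yaml.v2"
)

// mirrors of the structs added to pkg/scraper/config.go
type scraperCookies struct {
	Name   string `yaml:"Name"`
	Value  string `yaml:"Value"`
	Domain string `yaml:"Domain"`
	Path   string `yaml:"Path"`
}

type cookieOptions struct {
	CookieURL string            `yaml:"CookieURL"`
	Cookies   []*scraperCookies `yaml:"Cookies"`
}

type scraperDriverOptions struct {
	UseCDP  bool             `yaml:"useCDP"`
	Sleep   int              `yaml:"sleep"`
	Cookies []*cookieOptions `yaml:"cookies"`
}

func main() {
	// the driver section from the docs example above, unmarshalled directly
	doc := `
useCDP: false
cookies:
  - CookieURL: "https://www.example.com"
    Cookies:
      - Name: "_warning"
        Domain: ".example.com"
        Value: "true"
        Path: "/"
`
	var driver scraperDriverOptions
	if err := yaml.Unmarshal([]byte(doc), &driver); err != nil {
		panic(err)
	}
	ck := driver.Cookies[0]
	fmt.Println(ck.CookieURL, ck.Cookies[0].Name, ck.Cookies[0].Value) // https://www.example.com _warning true
}
```

The explicit struct tags (`CookieURL`, `Name`, ...) are why the yaml keys in the examples above must be capitalized exactly as shown.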