mirror of
https://github.com/stashapp/stash.git
synced 2025-12-18 04:44:37 +03:00
Increase xpath redirects, use cookies (#624)
This commit is contained in:
2
go.mod
2
go.mod
@@ -25,7 +25,7 @@ require (
|
|||||||
github.com/vektah/gqlparser v1.1.2
|
github.com/vektah/gqlparser v1.1.2
|
||||||
golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4
|
golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4
|
||||||
golang.org/x/image v0.0.0-20190118043309-183bebdce1b2
|
golang.org/x/image v0.0.0-20190118043309-183bebdce1b2
|
||||||
golang.org/x/net v0.0.0-20200421231249-e086a090c8fd
|
golang.org/x/net v0.0.0-20200602114024-627f9648deb9
|
||||||
gopkg.in/yaml.v2 v2.2.2
|
gopkg.in/yaml.v2 v2.2.2
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
2
go.sum
2
go.sum
@@ -676,6 +676,8 @@ golang.org/x/net v0.0.0-20190522155817-f3200d17e092 h1:4QSRKanuywn15aTZvI/mIDEgP
|
|||||||
golang.org/x/net v0.0.0-20190522155817-f3200d17e092/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
|
golang.org/x/net v0.0.0-20190522155817-f3200d17e092/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
|
||||||
golang.org/x/net v0.0.0-20200421231249-e086a090c8fd h1:QPwSajcTUrFriMF1nJ3XzgoqakqQEsnZf9LdXdi2nkI=
|
golang.org/x/net v0.0.0-20200421231249-e086a090c8fd h1:QPwSajcTUrFriMF1nJ3XzgoqakqQEsnZf9LdXdi2nkI=
|
||||||
golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
|
golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
|
||||||
|
golang.org/x/net v0.0.0-20200602114024-627f9648deb9 h1:pNX+40auqi2JqRfOP1akLGtYcn15TUbkhwuCO3foqqM=
|
||||||
|
golang.org/x/net v0.0.0-20200602114024-627f9648deb9/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
|
||||||
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
|
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
|
||||||
golang.org/x/oauth2 v0.0.0-20181017192945-9dcd33a902f4/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
|
golang.org/x/oauth2 v0.0.0-20181017192945-9dcd33a902f4/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
|
||||||
golang.org/x/oauth2 v0.0.0-20181106182150-f42d05182288/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
|
golang.org/x/oauth2 v0.0.0-20181106182150-f42d05182288/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import (
|
|||||||
"bytes"
|
"bytes"
|
||||||
"errors"
|
"errors"
|
||||||
"net/http"
|
"net/http"
|
||||||
|
"net/http/cookiejar"
|
||||||
"net/url"
|
"net/url"
|
||||||
"reflect"
|
"reflect"
|
||||||
"regexp"
|
"regexp"
|
||||||
@@ -13,6 +14,7 @@ import (
|
|||||||
"github.com/antchfx/htmlquery"
|
"github.com/antchfx/htmlquery"
|
||||||
"golang.org/x/net/html"
|
"golang.org/x/net/html"
|
||||||
"golang.org/x/net/html/charset"
|
"golang.org/x/net/html/charset"
|
||||||
|
"golang.org/x/net/publicsuffix"
|
||||||
|
|
||||||
"github.com/stashapp/stash/pkg/logger"
|
"github.com/stashapp/stash/pkg/logger"
|
||||||
"github.com/stashapp/stash/pkg/manager/config"
|
"github.com/stashapp/stash/pkg/manager/config"
|
||||||
@@ -587,8 +589,24 @@ func (r xPathResults) setKey(index int, key string, value string) xPathResults {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func loadURL(url string, c *scraperConfig) (*html.Node, error) {
|
func loadURL(url string, c *scraperConfig) (*html.Node, error) {
|
||||||
|
options := cookiejar.Options{
|
||||||
|
PublicSuffixList: publicsuffix.List,
|
||||||
|
}
|
||||||
|
jar, er := cookiejar.New(&options)
|
||||||
|
if er != nil {
|
||||||
|
return nil, er
|
||||||
|
}
|
||||||
|
|
||||||
client := &http.Client{
|
client := &http.Client{
|
||||||
Timeout: scrapeGetTimeout,
|
Timeout: scrapeGetTimeout,
|
||||||
|
// defaultCheckRedirect code with max changed from 10 to 20
|
||||||
|
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
||||||
|
if len(via) >= 20 {
|
||||||
|
return errors.New("stopped after 20 redirects")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
},
|
||||||
|
Jar: jar,
|
||||||
}
|
}
|
||||||
req, err := http.NewRequest("GET", url, nil)
|
req, err := http.NewRequest("GET", url, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
181
vendor/golang.org/x/net/publicsuffix/list.go
generated
vendored
Normal file
181
vendor/golang.org/x/net/publicsuffix/list.go
generated
vendored
Normal file
@@ -0,0 +1,181 @@
|
|||||||
|
// Copyright 2012 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
//go:generate go run gen.go
|
||||||
|
|
||||||
|
// Package publicsuffix provides a public suffix list based on data from
|
||||||
|
// https://publicsuffix.org/
|
||||||
|
//
|
||||||
|
// A public suffix is one under which Internet users can directly register
|
||||||
|
// names. It is related to, but different from, a TLD (top level domain).
|
||||||
|
//
|
||||||
|
// "com" is a TLD (top level domain). Top level means it has no dots.
|
||||||
|
//
|
||||||
|
// "com" is also a public suffix. Amazon and Google have registered different
|
||||||
|
// siblings under that domain: "amazon.com" and "google.com".
|
||||||
|
//
|
||||||
|
// "au" is another TLD, again because it has no dots. But it's not "amazon.au".
|
||||||
|
// Instead, it's "amazon.com.au".
|
||||||
|
//
|
||||||
|
// "com.au" isn't an actual TLD, because it's not at the top level (it has
|
||||||
|
// dots). But it is an eTLD (effective TLD), because that's the branching point
|
||||||
|
// for domain name registrars.
|
||||||
|
//
|
||||||
|
// Another name for "an eTLD" is "a public suffix". Often, what's more of
|
||||||
|
// interest is the eTLD+1, or one more label than the public suffix. For
|
||||||
|
// example, browsers partition read/write access to HTTP cookies according to
|
||||||
|
// the eTLD+1. Web pages served from "amazon.com.au" can't read cookies from
|
||||||
|
// "google.com.au", but web pages served from "maps.google.com" can share
|
||||||
|
// cookies from "www.google.com", so you don't have to sign into Google Maps
|
||||||
|
// separately from signing into Google Web Search. Note that all four of those
|
||||||
|
// domains have 3 labels and 2 dots. The first two domains are each an eTLD+1,
|
||||||
|
// the last two are not (but share the same eTLD+1: "google.com").
|
||||||
|
//
|
||||||
|
// All of these domains have the same eTLD+1:
|
||||||
|
// - "www.books.amazon.co.uk"
|
||||||
|
// - "books.amazon.co.uk"
|
||||||
|
// - "amazon.co.uk"
|
||||||
|
// Specifically, the eTLD+1 is "amazon.co.uk", because the eTLD is "co.uk".
|
||||||
|
//
|
||||||
|
// There is no closed form algorithm to calculate the eTLD of a domain.
|
||||||
|
// Instead, the calculation is data driven. This package provides a
|
||||||
|
// pre-compiled snapshot of Mozilla's PSL (Public Suffix List) data at
|
||||||
|
// https://publicsuffix.org/
|
||||||
|
package publicsuffix // import "golang.org/x/net/publicsuffix"
|
||||||
|
|
||||||
|
// TODO: specify case sensitivity and leading/trailing dot behavior for
|
||||||
|
// func PublicSuffix and func EffectiveTLDPlusOne.
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"net/http/cookiejar"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// List implements the cookiejar.PublicSuffixList interface by calling the
|
||||||
|
// PublicSuffix function.
|
||||||
|
var List cookiejar.PublicSuffixList = list{}
|
||||||
|
|
||||||
|
type list struct{}
|
||||||
|
|
||||||
|
func (list) PublicSuffix(domain string) string {
|
||||||
|
ps, _ := PublicSuffix(domain)
|
||||||
|
return ps
|
||||||
|
}
|
||||||
|
|
||||||
|
func (list) String() string {
|
||||||
|
return version
|
||||||
|
}
|
||||||
|
|
||||||
|
// PublicSuffix returns the public suffix of the domain using a copy of the
|
||||||
|
// publicsuffix.org database compiled into the library.
|
||||||
|
//
|
||||||
|
// icann is whether the public suffix is managed by the Internet Corporation
|
||||||
|
// for Assigned Names and Numbers. If not, the public suffix is either a
|
||||||
|
// privately managed domain (and in practice, not a top level domain) or an
|
||||||
|
// unmanaged top level domain (and not explicitly mentioned in the
|
||||||
|
// publicsuffix.org list). For example, "foo.org" and "foo.co.uk" are ICANN
|
||||||
|
// domains, "foo.dyndns.org" and "foo.blogspot.co.uk" are private domains and
|
||||||
|
// "cromulent" is an unmanaged top level domain.
|
||||||
|
//
|
||||||
|
// Use cases for distinguishing ICANN domains like "foo.com" from private
|
||||||
|
// domains like "foo.appspot.com" can be found at
|
||||||
|
// https://wiki.mozilla.org/Public_Suffix_List/Use_Cases
|
||||||
|
func PublicSuffix(domain string) (publicSuffix string, icann bool) {
|
||||||
|
lo, hi := uint32(0), uint32(numTLD)
|
||||||
|
s, suffix, icannNode, wildcard := domain, len(domain), false, false
|
||||||
|
loop:
|
||||||
|
for {
|
||||||
|
dot := strings.LastIndex(s, ".")
|
||||||
|
if wildcard {
|
||||||
|
icann = icannNode
|
||||||
|
suffix = 1 + dot
|
||||||
|
}
|
||||||
|
if lo == hi {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
f := find(s[1+dot:], lo, hi)
|
||||||
|
if f == notFound {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
u := nodes[f] >> (nodesBitsTextOffset + nodesBitsTextLength)
|
||||||
|
icannNode = u&(1<<nodesBitsICANN-1) != 0
|
||||||
|
u >>= nodesBitsICANN
|
||||||
|
u = children[u&(1<<nodesBitsChildren-1)]
|
||||||
|
lo = u & (1<<childrenBitsLo - 1)
|
||||||
|
u >>= childrenBitsLo
|
||||||
|
hi = u & (1<<childrenBitsHi - 1)
|
||||||
|
u >>= childrenBitsHi
|
||||||
|
switch u & (1<<childrenBitsNodeType - 1) {
|
||||||
|
case nodeTypeNormal:
|
||||||
|
suffix = 1 + dot
|
||||||
|
case nodeTypeException:
|
||||||
|
suffix = 1 + len(s)
|
||||||
|
break loop
|
||||||
|
}
|
||||||
|
u >>= childrenBitsNodeType
|
||||||
|
wildcard = u&(1<<childrenBitsWildcard-1) != 0
|
||||||
|
if !wildcard {
|
||||||
|
icann = icannNode
|
||||||
|
}
|
||||||
|
|
||||||
|
if dot == -1 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
s = s[:dot]
|
||||||
|
}
|
||||||
|
if suffix == len(domain) {
|
||||||
|
// If no rules match, the prevailing rule is "*".
|
||||||
|
return domain[1+strings.LastIndex(domain, "."):], icann
|
||||||
|
}
|
||||||
|
return domain[suffix:], icann
|
||||||
|
}
|
||||||
|
|
||||||
|
const notFound uint32 = 1<<32 - 1
|
||||||
|
|
||||||
|
// find returns the index of the node in the range [lo, hi) whose label equals
|
||||||
|
// label, or notFound if there is no such node. The range is assumed to be in
|
||||||
|
// strictly increasing node label order.
|
||||||
|
func find(label string, lo, hi uint32) uint32 {
|
||||||
|
for lo < hi {
|
||||||
|
mid := lo + (hi-lo)/2
|
||||||
|
s := nodeLabel(mid)
|
||||||
|
if s < label {
|
||||||
|
lo = mid + 1
|
||||||
|
} else if s == label {
|
||||||
|
return mid
|
||||||
|
} else {
|
||||||
|
hi = mid
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return notFound
|
||||||
|
}
|
||||||
|
|
||||||
|
// nodeLabel returns the label for the i'th node.
|
||||||
|
func nodeLabel(i uint32) string {
|
||||||
|
x := nodes[i]
|
||||||
|
length := x & (1<<nodesBitsTextLength - 1)
|
||||||
|
x >>= nodesBitsTextLength
|
||||||
|
offset := x & (1<<nodesBitsTextOffset - 1)
|
||||||
|
return text[offset : offset+length]
|
||||||
|
}
|
||||||
|
|
||||||
|
// EffectiveTLDPlusOne returns the effective top level domain plus one more
|
||||||
|
// label. For example, the eTLD+1 for "foo.bar.golang.org" is "golang.org".
|
||||||
|
func EffectiveTLDPlusOne(domain string) (string, error) {
|
||||||
|
if strings.HasPrefix(domain, ".") || strings.HasSuffix(domain, ".") || strings.Contains(domain, "..") {
|
||||||
|
return "", fmt.Errorf("publicsuffix: empty label in domain %q", domain)
|
||||||
|
}
|
||||||
|
|
||||||
|
suffix, _ := PublicSuffix(domain)
|
||||||
|
if len(domain) <= len(suffix) {
|
||||||
|
return "", fmt.Errorf("publicsuffix: cannot derive eTLD+1 for domain %q", domain)
|
||||||
|
}
|
||||||
|
i := len(domain) - len(suffix) - 1
|
||||||
|
if domain[i] != '.' {
|
||||||
|
return "", fmt.Errorf("publicsuffix: invalid public suffix %q for domain %q", suffix, domain)
|
||||||
|
}
|
||||||
|
return domain[1+strings.LastIndex(domain[:i], "."):], nil
|
||||||
|
}
|
||||||
10150
vendor/golang.org/x/net/publicsuffix/table.go
generated
vendored
Normal file
10150
vendor/golang.org/x/net/publicsuffix/table.go
generated
vendored
Normal file
File diff suppressed because it is too large
Load Diff
3
vendor/modules.txt
vendored
3
vendor/modules.txt
vendored
@@ -191,11 +191,12 @@ golang.org/x/image/tiff/lzw
|
|||||||
golang.org/x/image/vp8
|
golang.org/x/image/vp8
|
||||||
golang.org/x/image/vp8l
|
golang.org/x/image/vp8l
|
||||||
golang.org/x/image/webp
|
golang.org/x/image/webp
|
||||||
# golang.org/x/net v0.0.0-20200421231249-e086a090c8fd
|
# golang.org/x/net v0.0.0-20200602114024-627f9648deb9
|
||||||
golang.org/x/net/context/ctxhttp
|
golang.org/x/net/context/ctxhttp
|
||||||
golang.org/x/net/html
|
golang.org/x/net/html
|
||||||
golang.org/x/net/html/atom
|
golang.org/x/net/html/atom
|
||||||
golang.org/x/net/html/charset
|
golang.org/x/net/html/charset
|
||||||
|
golang.org/x/net/publicsuffix
|
||||||
# golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd
|
# golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd
|
||||||
golang.org/x/sys/unix
|
golang.org/x/sys/unix
|
||||||
golang.org/x/sys/windows
|
golang.org/x/sys/windows
|
||||||
|
|||||||
Reference in New Issue
Block a user