Refactor xpath scraper code. Add fixed and map (#616)

* Refactor xpath scraper code
* Make post-process a list
* Add map post-process action
* Add fixed xpath values
* Refactor scrapers into cache
* Refactor into mapped config
* Trim test html
This commit is contained in:
WithoutPants
2020-07-21 14:06:25 +10:00
committed by GitHub
parent f4ae9b09a6
commit 2b9215702e
17 changed files with 1421 additions and 1146 deletions

1
go.sum
View File

@@ -582,6 +582,7 @@ github.com/spf13/viper v1.3.1/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DM
github.com/spf13/viper v1.4.0 h1:yXHLWeravcrgGyFSyCgdYpXQ9dR9c/WED3pg1RhxqEU=
github.com/spf13/viper v1.4.0/go.mod h1:PTJ7Z/lr49W6bUbkmS1V3by4uWynFiR9p7+dSq/yZzE=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.1.1 h1:2vfRuCMp5sSVIDSqO8oNnWJq7mPa6KVP3iPIwFBuy8A=
github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.2.1/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=

View File

@@ -3,11 +3,11 @@ package api
import (
"context"
"github.com/stashapp/stash/pkg/scraper"
"github.com/stashapp/stash/pkg/manager"
)
func (r *mutationResolver) ReloadScrapers(ctx context.Context) (bool, error) {
err := scraper.ReloadScrapers()
err := manager.GetInstance().ScraperCache.ReloadScrapers()
if err != nil {
return false, err

View File

@@ -3,6 +3,7 @@ package api
import (
"context"
"github.com/stashapp/stash/pkg/manager"
"github.com/stashapp/stash/pkg/models"
"github.com/stashapp/stash/pkg/scraper"
)
@@ -12,12 +13,12 @@ func (r *queryResolver) ScrapeFreeones(ctx context.Context, performer_name strin
scrapedPerformer := models.ScrapedPerformerInput{
Name: &performer_name,
}
return scraper.GetFreeonesScraper().ScrapePerformer(scrapedPerformer)
return manager.GetInstance().ScraperCache.ScrapePerformer(scraper.FreeonesScraperID, scrapedPerformer)
}
// deprecated
func (r *queryResolver) ScrapeFreeonesPerformerList(ctx context.Context, query string) ([]string, error) {
scrapedPerformers, err := scraper.GetFreeonesScraper().ScrapePerformerNames(query)
scrapedPerformers, err := manager.GetInstance().ScraperCache.ScrapePerformerList(scraper.FreeonesScraperID, query)
if err != nil {
return nil, err
@@ -33,11 +34,11 @@ func (r *queryResolver) ScrapeFreeonesPerformerList(ctx context.Context, query s
}
func (r *queryResolver) ListPerformerScrapers(ctx context.Context) ([]*models.Scraper, error) {
return scraper.ListPerformerScrapers()
return manager.GetInstance().ScraperCache.ListPerformerScrapers(), nil
}
func (r *queryResolver) ListSceneScrapers(ctx context.Context) ([]*models.Scraper, error) {
return scraper.ListSceneScrapers()
return manager.GetInstance().ScraperCache.ListSceneScrapers(), nil
}
func (r *queryResolver) ScrapePerformerList(ctx context.Context, scraperID string, query string) ([]*models.ScrapedPerformer, error) {
@@ -45,21 +46,21 @@ func (r *queryResolver) ScrapePerformerList(ctx context.Context, scraperID strin
return nil, nil
}
return scraper.ScrapePerformerList(scraperID, query)
return manager.GetInstance().ScraperCache.ScrapePerformerList(scraperID, query)
}
func (r *queryResolver) ScrapePerformer(ctx context.Context, scraperID string, scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
return scraper.ScrapePerformer(scraperID, scrapedPerformer)
return manager.GetInstance().ScraperCache.ScrapePerformer(scraperID, scrapedPerformer)
}
func (r *queryResolver) ScrapePerformerURL(ctx context.Context, url string) (*models.ScrapedPerformer, error) {
return scraper.ScrapePerformerURL(url)
return manager.GetInstance().ScraperCache.ScrapePerformerURL(url)
}
func (r *queryResolver) ScrapeScene(ctx context.Context, scraperID string, scene models.SceneUpdateInput) (*models.ScrapedScene, error) {
return scraper.ScrapeScene(scraperID, scene)
return manager.GetInstance().ScraperCache.ScrapeScene(scraperID, scene)
}
func (r *queryResolver) ScrapeSceneURL(ctx context.Context, url string) (*models.ScrapedScene, error) {
return scraper.ScrapeSceneURL(url)
return manager.GetInstance().ScraperCache.ScrapeSceneURL(url)
}

View File

@@ -10,6 +10,7 @@ import (
"github.com/stashapp/stash/pkg/logger"
"github.com/stashapp/stash/pkg/manager/config"
"github.com/stashapp/stash/pkg/manager/paths"
"github.com/stashapp/stash/pkg/scraper"
"github.com/stashapp/stash/pkg/utils"
)
@@ -20,6 +21,8 @@ type singleton struct {
FFMPEGPath string
FFProbePath string
ScraperCache *scraper.Cache
}
var instance *singleton
@@ -47,6 +50,8 @@ func Initialize() *singleton {
Status: TaskStatus{Status: Idle, Progress: -1},
Paths: paths.NewPaths(),
JSON: &jsonUtils{},
ScraperCache: initScraperCache(),
}
instance.RefreshConfig()
@@ -146,6 +151,20 @@ func initLog() {
logger.Init(config.GetLogFile(), config.GetLogOut(), config.GetLogLevel())
}
func initScraperCache() *scraper.Cache {
scraperConfig := scraper.GlobalConfig{
Path: config.GetScrapersPath(),
UserAgent: config.GetScraperUserAgent(),
}
ret, err := scraper.NewCache(scraperConfig)
if err != nil {
logger.Errorf("Error reading scraper configs: %s", err.Error())
}
return ret
}
func (s *singleton) RefreshConfig() {
s.Paths = paths.NewPaths()
if config.IsValid() {

53
pkg/scraper/action.go Normal file
View File

@@ -0,0 +1,53 @@
package scraper
import "github.com/stashapp/stash/pkg/models"
// scraperAction identifies which scraping backend a scraper config uses.
type scraperAction string

const (
	scraperActionScript scraperAction = "script"
	scraperActionStash  scraperAction = "stash"
	scraperActionXPath  scraperAction = "scrapeXPath"
)

// allScraperAction lists every recognised scraper action.
var allScraperAction = []scraperAction{
	scraperActionScript,
	scraperActionStash,
	scraperActionXPath,
}

// IsValid reports whether e is one of the recognised scraper actions.
func (e scraperAction) IsValid() bool {
	for _, action := range allScraperAction {
		if e == action {
			return true
		}
	}

	return false
}
// scrapeOptions bundles the inputs common to all scraper invocations: the
// type-specific scraper config, the owning scraper config, and the
// process-wide global configuration.
type scrapeOptions struct {
	scraper      scraperTypeConfig
	config       config
	globalConfig GlobalConfig
}

// scraper is the interface implemented by each scraping backend
// (script, stash and xpath — see getScraper).
type scraper interface {
	// scrapePerformersByName searches for performers matching name.
	scrapePerformersByName(name string) ([]*models.ScrapedPerformer, error)
	// scrapePerformerByFragment scrapes a performer from a partial performer input.
	scrapePerformerByFragment(scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error)
	// scrapePerformerByURL scrapes a performer from its URL.
	scrapePerformerByURL(url string) (*models.ScrapedPerformer, error)
	// scrapeSceneByFragment scrapes a scene from a partial scene input.
	scrapeSceneByFragment(scene models.SceneUpdateInput) (*models.ScrapedScene, error)
	// scrapeSceneByURL scrapes a scene from its URL.
	scrapeSceneByURL(url string) (*models.ScrapedScene, error)
}
// getScraper constructs the scraper backend for the configured action.
// It panics on an unknown action; presumably configs are rejected earlier
// by scraperTypeConfig.validate() — NOTE(review): confirm all callers go
// through validated configs.
func getScraper(scraper scraperTypeConfig, config config, globalConfig GlobalConfig) scraper {
	switch scraper.Action {
	case scraperActionScript:
		return newScriptScraper(scraper, config, globalConfig)
	case scraperActionStash:
		return newStashScraper(scraper, config, globalConfig)
	case scraperActionXPath:
		return newXpathScraper(scraper, config, globalConfig)
	}

	panic("unknown scraper action: " + scraper.Action)
}

View File

@@ -1,6 +1,8 @@
package scraper
import (
"errors"
"fmt"
"io"
"os"
"path/filepath"
@@ -11,32 +13,80 @@ import (
"github.com/stashapp/stash/pkg/models"
)
type config struct {
ID string
path string
// The name of the scraper. This is displayed in the UI.
Name string `yaml:"name"`
// Configuration for querying performers by name
PerformerByName *scraperTypeConfig `yaml:"performerByName"`
// Configuration for querying performers by a Performer fragment
PerformerByFragment *scraperTypeConfig `yaml:"performerByFragment"`
// Configuration for querying a performer by a URL
PerformerByURL []*scrapeByURLConfig `yaml:"performerByURL"`
// Configuration for querying scenes by a Scene fragment
SceneByFragment *scraperTypeConfig `yaml:"sceneByFragment"`
// Configuration for querying a scene by a URL
SceneByURL []*scrapeByURLConfig `yaml:"sceneByURL"`
// Scraper debugging options
DebugOptions *scraperDebugOptions `yaml:"debug"`
// Stash server configuration
StashServer *stashServer `yaml:"stashServer"`
// Xpath scraping configurations
XPathScrapers mappedScrapers `yaml:"xPathScrapers"`
}
func (c config) validate() error {
if strings.TrimSpace(c.Name) == "" {
return errors.New("name must not be empty")
}
if c.PerformerByName != nil {
if err := c.PerformerByName.validate(); err != nil {
return err
}
}
if c.PerformerByFragment != nil {
if err := c.PerformerByFragment.validate(); err != nil {
return err
}
}
if c.SceneByFragment != nil {
if err := c.SceneByFragment.validate(); err != nil {
return err
}
}
for _, s := range c.PerformerByURL {
if err := s.validate(); err != nil {
return err
}
}
for _, s := range c.SceneByURL {
if err := s.validate(); err != nil {
return err
}
}
return nil
}
type stashServer struct {
URL string `yaml:"url"`
}
type scraperAction string
const (
scraperActionScript scraperAction = "script"
scraperActionStash scraperAction = "stash"
scraperActionXPath scraperAction = "scrapeXPath"
)
var allScraperAction = []scraperAction{
scraperActionScript,
scraperActionStash,
scraperActionXPath,
}
func (e scraperAction) IsValid() bool {
switch e {
case scraperActionScript, scraperActionStash, scraperActionXPath:
return true
}
return false
}
type scraperTypeConfig struct {
Action scraperAction `yaml:"action"`
Script []string `yaml:"script,flow"`
@@ -44,40 +94,18 @@ type scraperTypeConfig struct {
// for xpath name scraper only
QueryURL string `yaml:"queryURL"`
scraperConfig *scraperConfig
}
type scrapePerformerNamesFunc func(c scraperTypeConfig, name string) ([]*models.ScrapedPerformer, error)
type performerByNameConfig struct {
scraperTypeConfig `yaml:",inline"`
performScrape scrapePerformerNamesFunc
func (c scraperTypeConfig) validate() error {
if !c.Action.IsValid() {
return fmt.Errorf("%s is not a valid scraper action", c.Action)
}
func (c *performerByNameConfig) resolveFn() {
if c.Action == scraperActionScript {
c.performScrape = scrapePerformerNamesScript
} else if c.Action == scraperActionStash {
c.performScrape = scrapePerformerNamesStash
} else if c.Action == scraperActionXPath {
c.performScrape = scrapePerformerNamesXPath
}
if c.Action == scraperActionScript && len(c.Script) == 0 {
return errors.New("script is mandatory for script scraper action")
}
type scrapePerformerFragmentFunc func(c scraperTypeConfig, scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error)
type performerByFragmentConfig struct {
scraperTypeConfig `yaml:",inline"`
performScrape scrapePerformerFragmentFunc
}
func (c *performerByFragmentConfig) resolveFn() {
if c.Action == scraperActionScript {
c.performScrape = scrapePerformerFragmentScript
} else if c.Action == scraperActionStash {
c.performScrape = scrapePerformerFragmentStash
}
return nil
}
type scrapeByURLConfig struct {
@@ -85,6 +113,14 @@ type scrapeByURLConfig struct {
URL []string `yaml:"url,flow"`
}
func (c scrapeByURLConfig) validate() error {
if len(c.URL) == 0 {
return errors.New("url is mandatory for scrape by url scrapers")
}
return c.scraperTypeConfig.validate()
}
func (c scrapeByURLConfig) matchesURL(url string) bool {
for _, thisURL := range c.URL {
if strings.Contains(url, thisURL) {
@@ -95,71 +131,12 @@ func (c scrapeByURLConfig) matchesURL(url string) bool {
return false
}
type scrapePerformerByURLFunc func(c scraperTypeConfig, url string) (*models.ScrapedPerformer, error)
type scrapePerformerByURLConfig struct {
scrapeByURLConfig `yaml:",inline"`
performScrape scrapePerformerByURLFunc
}
func (c *scrapePerformerByURLConfig) resolveFn() {
if c.Action == scraperActionScript {
c.performScrape = scrapePerformerURLScript
} else if c.Action == scraperActionXPath {
c.performScrape = scrapePerformerURLXpath
}
}
type scrapeSceneFragmentFunc func(c scraperTypeConfig, scene models.SceneUpdateInput) (*models.ScrapedScene, error)
type sceneByFragmentConfig struct {
scraperTypeConfig `yaml:",inline"`
performScrape scrapeSceneFragmentFunc
}
func (c *sceneByFragmentConfig) resolveFn() {
if c.Action == scraperActionScript {
c.performScrape = scrapeSceneFragmentScript
} else if c.Action == scraperActionStash {
c.performScrape = scrapeSceneFragmentStash
}
}
type scrapeSceneByURLFunc func(c scraperTypeConfig, url string) (*models.ScrapedScene, error)
type scrapeSceneByURLConfig struct {
scrapeByURLConfig `yaml:",inline"`
performScrape scrapeSceneByURLFunc
}
func (c *scrapeSceneByURLConfig) resolveFn() {
if c.Action == scraperActionScript {
c.performScrape = scrapeSceneURLScript
} else if c.Action == scraperActionXPath {
c.performScrape = scrapeSceneURLXPath
}
}
type scraperDebugOptions struct {
PrintHTML bool `yaml:"printHTML"`
}
type scraperConfig struct {
ID string
Name string `yaml:"name"`
PerformerByName *performerByNameConfig `yaml:"performerByName"`
PerformerByFragment *performerByFragmentConfig `yaml:"performerByFragment"`
PerformerByURL []*scrapePerformerByURLConfig `yaml:"performerByURL"`
SceneByFragment *sceneByFragmentConfig `yaml:"sceneByFragment"`
SceneByURL []*scrapeSceneByURLConfig `yaml:"sceneByURL"`
DebugOptions *scraperDebugOptions `yaml:"debug"`
StashServer *stashServer `yaml:"stashServer"`
XPathScrapers xpathScrapers `yaml:"xPathScrapers"`
}
func loadScraperFromYAML(id string, reader io.Reader) (*scraperConfig, error) {
ret := &scraperConfig{}
func loadScraperFromYAML(id string, reader io.Reader) (*config, error) {
ret := &config{}
parser := yaml.NewDecoder(reader)
parser.SetStrict(true)
@@ -170,13 +147,14 @@ func loadScraperFromYAML(id string, reader io.Reader) (*scraperConfig, error) {
ret.ID = id
// set the scraper interface
ret.initialiseConfigs()
if err := ret.validate(); err != nil {
return nil, err
}
return ret, nil
}
func loadScraperFromYAMLFile(path string) (*scraperConfig, error) {
func loadScraperFromYAMLFile(path string) (*config, error) {
file, err := os.Open(path)
defer file.Close()
if err != nil {
@@ -187,34 +165,17 @@ func loadScraperFromYAMLFile(path string) (*scraperConfig, error) {
id := filepath.Base(path)
id = id[:strings.LastIndex(id, ".")]
return loadScraperFromYAML(id, file)
ret, err := loadScraperFromYAML(id, file)
if err != nil {
return nil, err
}
func (c *scraperConfig) initialiseConfigs() {
if c.PerformerByName != nil {
c.PerformerByName.resolveFn()
c.PerformerByName.scraperConfig = c
}
if c.PerformerByFragment != nil {
c.PerformerByFragment.resolveFn()
c.PerformerByFragment.scraperConfig = c
}
for _, s := range c.PerformerByURL {
s.resolveFn()
s.scraperConfig = c
ret.path = path
return ret, nil
}
if c.SceneByFragment != nil {
c.SceneByFragment.resolveFn()
c.SceneByFragment.scraperConfig = c
}
for _, s := range c.SceneByURL {
s.resolveFn()
s.scraperConfig = c
}
}
func (c scraperConfig) toScraper() *models.Scraper {
func (c config) toScraper() *models.Scraper {
ret := models.Scraper{
ID: c.ID,
Name: c.Name,
@@ -256,11 +217,11 @@ func (c scraperConfig) toScraper() *models.Scraper {
return &ret
}
func (c scraperConfig) supportsPerformers() bool {
func (c config) supportsPerformers() bool {
return c.PerformerByName != nil || c.PerformerByFragment != nil || len(c.PerformerByURL) > 0
}
func (c scraperConfig) matchesPerformerURL(url string) bool {
func (c config) matchesPerformerURL(url string) bool {
for _, scraper := range c.PerformerByURL {
if scraper.matchesURL(url) {
return true
@@ -270,31 +231,34 @@ func (c scraperConfig) matchesPerformerURL(url string) bool {
return false
}
func (c scraperConfig) ScrapePerformerNames(name string) ([]*models.ScrapedPerformer, error) {
if c.PerformerByName != nil && c.PerformerByName.performScrape != nil {
return c.PerformerByName.performScrape(c.PerformerByName.scraperTypeConfig, name)
func (c config) ScrapePerformerNames(name string, globalConfig GlobalConfig) ([]*models.ScrapedPerformer, error) {
if c.PerformerByName != nil {
s := getScraper(*c.PerformerByName, c, globalConfig)
return s.scrapePerformersByName(name)
}
return nil, nil
}
func (c scraperConfig) ScrapePerformer(scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
if c.PerformerByFragment != nil && c.PerformerByFragment.performScrape != nil {
return c.PerformerByFragment.performScrape(c.PerformerByFragment.scraperTypeConfig, scrapedPerformer)
func (c config) ScrapePerformer(scrapedPerformer models.ScrapedPerformerInput, globalConfig GlobalConfig) (*models.ScrapedPerformer, error) {
if c.PerformerByFragment != nil {
s := getScraper(*c.PerformerByFragment, c, globalConfig)
return s.scrapePerformerByFragment(scrapedPerformer)
}
// try to match against URL if present
if scrapedPerformer.URL != nil && *scrapedPerformer.URL != "" {
return c.ScrapePerformerURL(*scrapedPerformer.URL)
return c.ScrapePerformerURL(*scrapedPerformer.URL, globalConfig)
}
return nil, nil
}
func (c scraperConfig) ScrapePerformerURL(url string) (*models.ScrapedPerformer, error) {
func (c config) ScrapePerformerURL(url string, globalConfig GlobalConfig) (*models.ScrapedPerformer, error) {
for _, scraper := range c.PerformerByURL {
if scraper.matchesURL(url) && scraper.performScrape != nil {
ret, err := scraper.performScrape(scraper.scraperTypeConfig, url)
if scraper.matchesURL(url) {
s := getScraper(scraper.scraperTypeConfig, c, globalConfig)
ret, err := s.scrapePerformerByURL(url)
if err != nil {
return nil, err
}
@@ -308,11 +272,11 @@ func (c scraperConfig) ScrapePerformerURL(url string) (*models.ScrapedPerformer,
return nil, nil
}
func (c scraperConfig) supportsScenes() bool {
func (c config) supportsScenes() bool {
return c.SceneByFragment != nil || len(c.SceneByURL) > 0
}
func (c scraperConfig) matchesSceneURL(url string) bool {
func (c config) matchesSceneURL(url string) bool {
for _, scraper := range c.SceneByURL {
if scraper.matchesURL(url) {
return true
@@ -322,18 +286,20 @@ func (c scraperConfig) matchesSceneURL(url string) bool {
return false
}
func (c scraperConfig) ScrapeScene(scene models.SceneUpdateInput) (*models.ScrapedScene, error) {
if c.SceneByFragment != nil && c.SceneByFragment.performScrape != nil {
return c.SceneByFragment.performScrape(c.SceneByFragment.scraperTypeConfig, scene)
func (c config) ScrapeScene(scene models.SceneUpdateInput, globalConfig GlobalConfig) (*models.ScrapedScene, error) {
if c.SceneByFragment != nil {
s := getScraper(*c.SceneByFragment, c, globalConfig)
return s.scrapeSceneByFragment(scene)
}
return nil, nil
}
func (c scraperConfig) ScrapeSceneURL(url string) (*models.ScrapedScene, error) {
func (c config) ScrapeSceneURL(url string, globalConfig GlobalConfig) (*models.ScrapedScene, error) {
for _, scraper := range c.SceneByURL {
if scraper.matchesURL(url) && scraper.performScrape != nil {
ret, err := scraper.performScrape(scraper.scraperTypeConfig, url)
if scraper.matchesURL(url) {
s := getScraper(scraper.scraperTypeConfig, c, globalConfig)
ret, err := s.scrapeSceneByURL(url)
if err != nil {
return nil, err
}

View File

@@ -6,7 +6,8 @@ import (
"github.com/stashapp/stash/pkg/logger"
)
const freeonesScraperID = "builtin_freeones"
// FreeonesScraperID is the scraper ID for the built-in Freeones scraper
const FreeonesScraperID = "builtin_freeones"
// stolen from: https://github.com/stashapp/CommunityScrapers/blob/master/scrapers/FreeonesCommunity.yml
const freeonesScraperConfig = `
@@ -103,10 +104,10 @@ xPathScrapers:
# Last updated June 15, 2020
`
func GetFreeonesScraper() scraperConfig {
func getFreeonesScraper() config {
yml := freeonesScraperConfig
scraper, err := loadScraperFromYAML(freeonesScraperID, strings.NewReader(yml))
scraper, err := loadScraperFromYAML(FreeonesScraperID, strings.NewReader(yml))
if err != nil {
logger.Fatalf("Error loading builtin freeones scraper: %s", err.Error())
}

View File

@@ -6,7 +6,6 @@ import (
"strings"
"time"
"github.com/stashapp/stash/pkg/manager/config"
"github.com/stashapp/stash/pkg/models"
"github.com/stashapp/stash/pkg/utils"
)
@@ -15,13 +14,13 @@ import (
// configurable at some point.
const imageGetTimeout = time.Second * 30
func setPerformerImage(p *models.ScrapedPerformer) error {
func setPerformerImage(p *models.ScrapedPerformer, globalConfig GlobalConfig) error {
if p == nil || p.Image == nil || !strings.HasPrefix(*p.Image, "http") {
// nothing to do
return nil
}
img, err := getImage(*p.Image)
img, err := getImage(*p.Image, globalConfig)
if err != nil {
return err
}
@@ -31,14 +30,14 @@ func setPerformerImage(p *models.ScrapedPerformer) error {
return nil
}
func setSceneImage(s *models.ScrapedScene) error {
func setSceneImage(s *models.ScrapedScene, globalConfig GlobalConfig) error {
// don't try to get the image if it doesn't appear to be a URL
if s == nil || s.Image == nil || !strings.HasPrefix(*s.Image, "http") {
// nothing to do
return nil
}
img, err := getImage(*s.Image)
img, err := getImage(*s.Image, globalConfig)
if err != nil {
return err
}
@@ -48,7 +47,7 @@ func setSceneImage(s *models.ScrapedScene) error {
return nil
}
func getImage(url string) (*string, error) {
func getImage(url string, globalConfig GlobalConfig) (*string, error) {
client := &http.Client{
Timeout: imageGetTimeout,
}
@@ -58,7 +57,7 @@ func getImage(url string) (*string, error) {
return nil, err
}
userAgent := config.GetScraperUserAgent()
userAgent := globalConfig.UserAgent
if userAgent != "" {
req.Header.Set("User-Agent", userAgent)
}
@@ -93,10 +92,10 @@ func getImage(url string) (*string, error) {
return &img, nil
}
func getStashPerformerImage(stashURL string, performerID string) (*string, error) {
return getImage(stashURL + "/performer/" + performerID + "/image")
func getStashPerformerImage(stashURL string, performerID string, globalConfig GlobalConfig) (*string, error) {
return getImage(stashURL+"/performer/"+performerID+"/image", globalConfig)
}
func getStashSceneImage(stashURL string, sceneID string) (*string, error) {
return getImage(stashURL + "/scene/" + sceneID + "/screenshot")
func getStashSceneImage(stashURL string, sceneID string, globalConfig GlobalConfig) (*string, error) {
return getImage(stashURL+"/scene/"+sceneID+"/screenshot", globalConfig)
}

600
pkg/scraper/mapped.go Normal file
View File

@@ -0,0 +1,600 @@
package scraper
import (
"errors"
"fmt"
"reflect"
"regexp"
"strings"
"time"
"github.com/stashapp/stash/pkg/logger"
"github.com/stashapp/stash/pkg/models"
"gopkg.in/yaml.v2"
)
type mappedQuery interface {
runQuery(selector string) []string
subScrape(value string) mappedQuery
}
// commonMappedConfig maps placeholder keys to replacement fragments that can
// be substituted into attribute selectors (see applyCommon).
type commonMappedConfig map[string]string

// mappedConfig maps attribute names (eg "Name") to the configuration used to
// scrape that attribute.
type mappedConfig map[string]mappedScraperAttrConfig

// applyCommon replaces every occurrence of each common key in src with its
// configured value. A nil common config returns src unchanged.
func (s mappedConfig) applyCommon(c commonMappedConfig, src string) string {
	if c == nil {
		return src
	}

	ret := src
	for commonKey, commonVal := range c {
		// strings.Replace is a no-op when the key is absent, so the previous
		// strings.Contains pre-check was redundant and has been removed.
		ret = strings.Replace(ret, commonKey, commonVal, -1)
	}

	return ret
}
// process runs every attribute query in the config against q and collates the
// values into indexed results, where result i holds the i-th scraped value of
// each attribute. Attributes with a Fixed value bypass the query entirely.
func (s mappedConfig) process(q mappedQuery, common commonMappedConfig) mappedResults {
	var ret mappedResults

	for k, attrConfig := range s {
		if attrConfig.Fixed != "" {
			// fixed value is set on the first result only
			// TODO - not sure if this needs to set _all_ indexes for the key
			const i = 0
			ret = ret.setKey(i, k, attrConfig.Fixed)
		} else {
			// apply common fragment substitution before querying
			selector := attrConfig.Selector
			selector = s.applyCommon(common, selector)

			found := q.runQuery(selector)

			if len(found) > 0 {
				result := s.postProcess(q, attrConfig, found)

				for i, text := range result {
					ret = ret.setKey(i, k, text)
				}
			}
		}
	}

	return ret
}
// postProcess applies an attribute's post-processing actions, plus its
// concat/split handling, to the raw query results. With concat configured the
// results are joined, post-processed as one value, then optionally split;
// otherwise each result is post-processed individually.
func (s mappedConfig) postProcess(q mappedQuery, attrConfig mappedScraperAttrConfig, found []string) []string {
	// check if we're concatenating the results into a single result
	var ret []string
	if attrConfig.hasConcat() {
		result := attrConfig.concatenateResults(found)
		result = attrConfig.postProcess(result, q)
		if attrConfig.hasSplit() {
			return attrConfig.splitString(result)
		}

		ret = []string{result}
	} else {
		for _, text := range found {
			text = attrConfig.postProcess(text, q)
			if attrConfig.hasSplit() {
				// NOTE(review): split returns from inside the loop, so only
				// the first found value is split and the rest are discarded —
				// confirm this is the intended behaviour
				return attrConfig.splitString(text)
			}

			ret = append(ret, text)
		}
	}

	return ret
}
// mappedSceneScraperConfig holds the attribute mappings for a scene scraper.
// The embedded mappedConfig carries the flat scene fields; the named members
// carry the sub-object mappings.
type mappedSceneScraperConfig struct {
	mappedConfig

	Tags       mappedConfig `yaml:"Tags"`
	Performers mappedConfig `yaml:"Performers"`
	Studio     mappedConfig `yaml:"Studio"`
	Movies     mappedConfig `yaml:"Movies"`
}

// _mappedSceneScraperConfig is a conversion alias used during unmarshalling
// to avoid recursing back into UnmarshalYAML.
type _mappedSceneScraperConfig mappedSceneScraperConfig

// yaml keys of the known scene sub-object fields
const (
	mappedScraperConfigSceneTags       = "Tags"
	mappedScraperConfigScenePerformers = "Performers"
	mappedScraperConfigSceneStudio     = "Studio"
	mappedScraperConfigSceneMovies     = "Movies"
)
// UnmarshalYAML unmarshals a scene mapping. The known sub-object fields
// (Tags/Performers/Studio/Movies) are split off and decoded into the named
// members, while everything else is decoded into the embedded mappedConfig.
func (s *mappedSceneScraperConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
	// HACK - unmarshal to map first, then remove known scene sub-fields, then
	// remarshal to yaml and pass that down to the base map
	parentMap := make(map[string]interface{})
	if err := unmarshal(parentMap); err != nil {
		return err
	}

	// move the known sub-fields to a separate map
	thisMap := make(map[string]interface{})

	thisMap[mappedScraperConfigSceneTags] = parentMap[mappedScraperConfigSceneTags]
	thisMap[mappedScraperConfigScenePerformers] = parentMap[mappedScraperConfigScenePerformers]
	thisMap[mappedScraperConfigSceneStudio] = parentMap[mappedScraperConfigSceneStudio]
	thisMap[mappedScraperConfigSceneMovies] = parentMap[mappedScraperConfigSceneMovies]

	delete(parentMap, mappedScraperConfigSceneTags)
	delete(parentMap, mappedScraperConfigScenePerformers)
	delete(parentMap, mappedScraperConfigSceneStudio)
	delete(parentMap, mappedScraperConfigSceneMovies)

	// re-unmarshal the sub-fields
	yml, err := yaml.Marshal(thisMap)
	if err != nil {
		return err
	}

	// needs to be a different type to prevent infinite recursion
	c := _mappedSceneScraperConfig{}
	if err := yaml.Unmarshal(yml, &c); err != nil {
		return err
	}

	*s = mappedSceneScraperConfig(c)

	// decode the remaining (flat) fields into the embedded map
	yml, err = yaml.Marshal(parentMap)
	if err != nil {
		return err
	}

	if err := yaml.Unmarshal(yml, &s.mappedConfig); err != nil {
		return err
	}

	return nil
}
// mappedPerformerScraperConfig holds the attribute mappings for a performer
// scraper. Performers have no sub-object mappings, so it is just the flat map.
type mappedPerformerScraperConfig struct {
	mappedConfig
}

// UnmarshalYAML decodes the mapping directly into the embedded mappedConfig.
func (s *mappedPerformerScraperConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
	return unmarshal(&s.mappedConfig)
}
// mappedRegexConfig is a single regex find/replace step.
type mappedRegexConfig struct {
	Regex string `yaml:"regex"`
	With  string `yaml:"with"`
}

// mappedRegexConfigs is an ordered list of regex replacements.
type mappedRegexConfigs []mappedRegexConfig

// apply runs the regex replacement on value. An empty regex is a no-op; an
// invalid regex logs a warning and returns value unchanged.
func (c mappedRegexConfig) apply(value string) string {
	if c.Regex != "" {
		re, err := regexp.Compile(c.Regex)
		if err != nil {
			logger.Warnf("Error compiling regex '%s': %s", c.Regex, err.Error())
			return value
		}

		ret := re.ReplaceAllString(value, c.With)

		// trim leading and trailing whitespace
		// this is done to maintain backwards compatibility with existing
		// scrapers
		ret = strings.TrimSpace(ret)

		logger.Debugf(`Replace: '%s' with '%s'`, c.Regex, c.With)
		logger.Debugf("Before: %s", value)
		logger.Debugf("After: %s", ret)
		return ret
	}

	return value
}

// apply runs each configured replacement over value in order, feeding each
// step's output into the next.
func (c mappedRegexConfigs) apply(value string) string {
	// apply regex in order
	for _, config := range c {
		value = config.apply(value)
	}

	return value
}
// postProcessAction transforms a scraped value. q is the originating query,
// available for actions that need to perform further scraping.
type postProcessAction interface {
	Apply(value string, q mappedQuery) string
}

// postProcessParseDate re-formats a date value, parsed with the configured
// layout, into the internal YYYY-MM-DD format.
type postProcessParseDate string

// Apply parses value using the configured layout and returns it re-formatted
// as 2006-01-02. An empty layout, or a value that does not match the layout,
// returns value unchanged (a parse failure is logged).
func (p *postProcessParseDate) Apply(value string, q mappedQuery) string {
	layout := string(*p)
	if layout == "" {
		return value
	}

	// fall back to the original value when it does not match the layout
	parsedValue, err := time.Parse(layout, value)
	if err != nil {
		logger.Warnf("Error parsing date string '%s' using format '%s': %s", value, layout, err.Error())
		return value
	}

	// convert it into our date format
	const internalDateFormat = "2006-01-02"
	return parsedValue.Format(internalDateFormat)
}
// postProcessReplace applies an ordered list of regex replacements to the value.
type postProcessReplace mappedRegexConfigs

// Apply runs each configured regex replacement over value in order.
func (c *postProcessReplace) Apply(value string, q mappedQuery) string {
	return mappedRegexConfigs(*c).apply(value)
}
// postProcessSubScraper scrapes the value itself (typically a URL) with a
// nested attribute configuration and substitutes the nested result.
type postProcessSubScraper mappedScraperAttrConfig

// Apply sub-scrapes value with the nested config, returning the nested
// (post-processed, optionally concatenated) result, or "" when the sub-scrape
// yields nothing.
func (p *postProcessSubScraper) Apply(value string, q mappedQuery) string {
	subScrapeConfig := mappedScraperAttrConfig(*p)

	logger.Debugf("Sub-scraping for: %s", value)
	ss := q.subScrape(value)

	if ss != nil {
		found := ss.runQuery(subScrapeConfig.Selector)

		if len(found) > 0 {
			// check if we're concatenating the results into a single result
			var result string
			if subScrapeConfig.hasConcat() {
				result = subScrapeConfig.concatenateResults(found)
			} else {
				result = found[0]
			}

			result = subScrapeConfig.postProcess(result, ss)
			return result
		}
	}

	return ""
}
// postProcessMap translates specific scraped values into replacements via a
// lookup table; values without a mapping pass through unchanged.
type postProcessMap map[string]string

// Apply returns the mapped replacement for value, or value itself when no
// mapping exists.
func (p *postProcessMap) Apply(value string, q mappedQuery) string {
	// return the mapped value if present
	if mapped, ok := (*p)[value]; ok {
		return mapped
	}

	return value
}
// mappedPostProcessAction is the yaml form of a single post-process step.
// Exactly one of its fields may be populated.
type mappedPostProcessAction struct {
	ParseDate  string                   `yaml:"parseDate"`
	Replace    mappedRegexConfigs       `yaml:"replace"`
	SubScraper *mappedScraperAttrConfig `yaml:"subScraper"`
	Map        map[string]string        `yaml:"map"`
}

// ToPostProcessAction converts the yaml form into a concrete
// postProcessAction. It is an error for zero fields, or more than one field,
// to be populated.
func (a mappedPostProcessAction) ToPostProcessAction() (postProcessAction, error) {
	var found string
	var ret postProcessAction

	// setAction records the chosen action, rejecting a second populated field.
	setAction := func(name string, action postProcessAction) error {
		if found != "" {
			return fmt.Errorf("post-process actions must have a single field, found %s and %s", found, name)
		}
		found = name
		ret = action
		return nil
	}

	if a.ParseDate != "" {
		action := postProcessParseDate(a.ParseDate)
		if err := setAction("parseDate", &action); err != nil {
			return nil, err
		}
	}
	if len(a.Replace) > 0 {
		action := postProcessReplace(a.Replace)
		if err := setAction("replace", &action); err != nil {
			return nil, err
		}
	}
	if a.SubScraper != nil {
		action := postProcessSubScraper(*a.SubScraper)
		if err := setAction("subScraper", &action); err != nil {
			return nil, err
		}
	}
	if a.Map != nil {
		action := postProcessMap(a.Map)
		if err := setAction("map", &action); err != nil {
			return nil, err
		}
	}

	if ret == nil {
		return nil, errors.New("invalid post-process action")
	}

	return ret, nil
}
// mappedScraperAttrConfig configures how a single attribute is scraped:
// either a fixed literal value, or a selector plus optional post-processing,
// concatenation and splitting.
type mappedScraperAttrConfig struct {
	Selector    string                    `yaml:"selector"`
	Fixed       string                    `yaml:"fixed"`
	PostProcess []mappedPostProcessAction `yaml:"postProcess"`
	Concat      string                    `yaml:"concat"`
	Split       string                    `yaml:"split"`

	// resolved actions built from PostProcess or the deprecated fields below
	postProcessActions []postProcessAction

	// deprecated: use PostProcess instead
	ParseDate  string                   `yaml:"parseDate"`
	Replace    mappedRegexConfigs       `yaml:"replace"`
	SubScraper *mappedScraperAttrConfig `yaml:"subScraper"`
}

// _mappedScraperAttrConfig is a conversion alias used during unmarshalling
// to avoid recursing back into UnmarshalYAML.
type _mappedScraperAttrConfig mappedScraperAttrConfig

// UnmarshalYAML accepts either a bare selector string or the full attribute
// object, then resolves the post-process actions.
func (c *mappedScraperAttrConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
	// try unmarshalling into a string first
	if err := unmarshal(&c.Selector); err != nil {
		// if it's a type error then we try to unmarshall to the full object
		if _, ok := err.(*yaml.TypeError); !ok {
			return err
		}

		// unmarshall to full object
		// need it as a separate object
		t := _mappedScraperAttrConfig{}
		if err = unmarshal(&t); err != nil {
			return err
		}

		*c = mappedScraperAttrConfig(t)
	}

	return c.convertPostProcessActions()
}
// convertPostProcessActions builds postProcessActions from either the
// PostProcess list or the deprecated per-field configuration; mixing the two
// is an error. The yaml-sourced fields are cleared once converted.
func (c *mappedScraperAttrConfig) convertPostProcessActions() error {
	// ensure we don't have the old deprecated fields and the new post process field
	if len(c.PostProcess) > 0 {
		if c.ParseDate != "" || len(c.Replace) > 0 || c.SubScraper != nil {
			return errors.New("cannot include postProcess and (parseDate, replace, subScraper) deprecated fields")
		}

		// convert xpathPostProcessAction actions to postProcessActions
		for _, a := range c.PostProcess {
			action, err := a.ToPostProcessAction()
			if err != nil {
				return err
			}
			c.postProcessActions = append(c.postProcessActions, action)
		}

		c.PostProcess = nil
	} else {
		// convert old deprecated fields if present
		// in same order as they used to be executed
		if len(c.Replace) > 0 {
			action := postProcessReplace(c.Replace)
			c.postProcessActions = append(c.postProcessActions, &action)
			c.Replace = nil
		}

		if c.SubScraper != nil {
			action := postProcessSubScraper(*c.SubScraper)
			c.postProcessActions = append(c.postProcessActions, &action)
			c.SubScraper = nil
		}

		if c.ParseDate != "" {
			action := postProcessParseDate(c.ParseDate)
			c.postProcessActions = append(c.postProcessActions, &action)
			c.ParseDate = ""
		}
	}

	return nil
}
// hasConcat reports whether query results should be joined into one value.
func (c mappedScraperAttrConfig) hasConcat() bool {
	return c.Concat != ""
}

// hasSplit reports whether the result should be split into multiple values.
func (c mappedScraperAttrConfig) hasSplit() bool {
	return c.Split != ""
}
// concatenateResults joins the found values into a single string using the
// configured Concat separator.
func (c mappedScraperAttrConfig) concatenateResults(nodes []string) string {
	// strings.Join replaces the previous manual element-copy loop
	return strings.Join(nodes, c.Concat)
}
// splitString splits value on the configured Split separator, dropping empty
// fragments. With no separator configured, value is returned as a
// single-element slice.
func (c mappedScraperAttrConfig) splitString(value string) []string {
	if c.Split == "" {
		return []string{value}
	}

	var ret []string
	for _, part := range strings.Split(value, c.Split) {
		if part != "" {
			ret = append(ret, part)
		}
	}

	return ret
}
// postProcess runs every configured post-process action over value in
// order, feeding the output of each action into the next, and returns the
// final result.
func (c mappedScraperAttrConfig) postProcess(value string, q mappedQuery) string {
	result := value
	for _, action := range c.postProcessActions {
		result = action.Apply(result, q)
	}
	return result
}
// mappedScrapers maps a scraper name to its mapped scraper configuration.
type mappedScrapers map[string]*mappedScraper

// mappedScraper holds a single mapped scraper configuration, unmarshalled
// from yaml (see the field tags). Common holds shared fragments that may be
// substituted into selectors; Scene and Performer are each optional.
type mappedScraper struct {
	Common commonMappedConfig `yaml:"common"`
	Scene *mappedSceneScraperConfig `yaml:"scene"`
	Performer *mappedPerformerScraperConfig `yaml:"performer"`
}
// mappedResult is a single scraped object, mapping a destination struct
// field name to the scraped string value (see mappedResult.apply).
type mappedResult map[string]string

// mappedResults is an ordered collection of scraped objects.
type mappedResults []mappedResult
// apply copies each key/value pair in the result into the same-named field
// of dest via reflection. dest must be a pointer to a struct; keys with no
// matching field are logged as errors and skipped.
func (r mappedResult) apply(dest interface{}) {
	// dest should be a pointer
	structVal := reflect.ValueOf(dest).Elem()

	for name, text := range r {
		field := structVal.FieldByName(name)
		if !field.IsValid() {
			logger.Errorf("Field %s does not exist in %T", name, dest)
			continue
		}

		if field.Kind() == reflect.Ptr {
			// need to copy the value, otherwise everything is set to the
			// same pointer
			copied := text
			field.Set(reflect.ValueOf(&copied))
		} else {
			field.Set(reflect.ValueOf(text))
		}
	}
}
// setKey sets key to value on the result object at index, growing the
// results slice as required, and returns the (possibly reallocated) slice.
func (r mappedResults) setKey(index int, key string, value string) mappedResults {
	// grow until index is addressable; the previous code appended exactly
	// one element, which would panic below if index ever skipped ahead of
	// len(r) by more than one
	for index >= len(r) {
		r = append(r, make(mappedResult))
	}

	logger.Debugf(`[%d][%s] = %s`, index, key, value)
	r[index][key] = value

	return r
}
// scrapePerformer scrapes a single performer from the query using the
// performer mapping. It returns nil if no performer mapping is configured;
// otherwise it returns a (possibly empty) performer built from the first
// result.
func (s mappedScraper) scrapePerformer(q mappedQuery) (*models.ScrapedPerformer, error) {
	if s.Performer == nil {
		return nil, nil
	}

	var performer models.ScrapedPerformer
	if results := s.Performer.process(q, s.Common); len(results) > 0 {
		results[0].apply(&performer)
	}

	return &performer, nil
}
// scrapePerformers scrapes a list of performers from the query using the
// performer mapping, one performer per result. It returns nil if no
// performer mapping is configured.
func (s mappedScraper) scrapePerformers(q mappedQuery) ([]*models.ScrapedPerformer, error) {
	if s.Performer == nil {
		return nil, nil
	}

	var performers []*models.ScrapedPerformer
	for _, result := range s.Performer.process(q, s.Common) {
		performer := &models.ScrapedPerformer{}
		result.apply(performer)
		performers = append(performers, performer)
	}

	return performers, nil
}
// scrapeScene scrapes a single scene from the query using the scene mapping,
// including the Performers, Tags, Studio and Movies sub-mappings. It returns
// nil if no scene mapping is configured.
func (s mappedScraper) scrapeScene(q mappedQuery) (*models.ScrapedScene, error) {
	var ret models.ScrapedScene

	sceneScraperConfig := s.Scene
	// guard against a missing scene mapping before dereferencing it;
	// previously a nil Scene pointer would panic on the field access below
	if sceneScraperConfig == nil {
		return nil, nil
	}

	sceneMap := sceneScraperConfig.mappedConfig
	if sceneMap == nil {
		return nil, nil
	}

	scenePerformersMap := sceneScraperConfig.Performers
	sceneTagsMap := sceneScraperConfig.Tags
	sceneStudioMap := sceneScraperConfig.Studio
	sceneMoviesMap := sceneScraperConfig.Movies

	logger.Debug(`Processing scene:`)
	results := sceneMap.process(q, s.Common)
	if len(results) > 0 {
		results[0].apply(&ret)

		// now apply the performers and tags
		if scenePerformersMap != nil {
			logger.Debug(`Processing scene performers:`)
			performerResults := scenePerformersMap.process(q, s.Common)

			for _, p := range performerResults {
				performer := &models.ScrapedScenePerformer{}
				p.apply(performer)
				ret.Performers = append(ret.Performers, performer)
			}
		}

		if sceneTagsMap != nil {
			logger.Debug(`Processing scene tags:`)
			tagResults := sceneTagsMap.process(q, s.Common)

			for _, p := range tagResults {
				tag := &models.ScrapedSceneTag{}
				p.apply(tag)
				ret.Tags = append(ret.Tags, tag)
			}
		}

		if sceneStudioMap != nil {
			logger.Debug(`Processing scene studio:`)
			studioResults := sceneStudioMap.process(q, s.Common)

			if len(studioResults) > 0 {
				studio := &models.ScrapedSceneStudio{}
				studioResults[0].apply(studio)
				ret.Studio = studio
			}
		}

		if sceneMoviesMap != nil {
			logger.Debug(`Processing scene movies:`)
			movieResults := sceneMoviesMap.process(q, s.Common)

			for _, p := range movieResults {
				movie := &models.ScrapedSceneMovie{}
				p.apply(movie)
				ret.Movies = append(ret.Movies, movie)
			}
		}
	}

	return &ret, nil
}

View File

@@ -0,0 +1,31 @@
package scraper
import (
"testing"
"gopkg.in/yaml.v2"
)
// TestInvalidPostProcessAction verifies that unmarshalling a scraper config
// fails when the postProcess list contains an entry that is not a recognised
// post-process action (the bare "anything" entry in the fixture below).
func TestInvalidPostProcessAction(t *testing.T) {
	yamlStr := `name: Test
performerByURL:
- action: scrapeXPath
scraper: performerScraper
xPathScrapers:
performerScraper:
performer:
Name:
selector: //div/a/@href
postProcess:
- parseDate: Jan 2, 2006
- anything
`

	c := &config{}
	err := yaml.Unmarshal([]byte(yamlStr), &c)

	// unmarshalling must reject the unknown action rather than ignore it
	if err == nil {
		t.Error("expected error unmarshalling with invalid post-process action")
		return
	}
}

View File

@@ -7,19 +7,42 @@ import (
"strconv"
"github.com/stashapp/stash/pkg/logger"
"github.com/stashapp/stash/pkg/manager/config"
"github.com/stashapp/stash/pkg/models"
)
var scrapers []scraperConfig
func loadScrapers() ([]scraperConfig, error) {
if scrapers != nil {
return scrapers, nil
// GlobalConfig contains the global scraper options.
type GlobalConfig struct {
// User Agent used when scraping using http.
UserAgent string
Path string
}
path := config.GetScrapersPath()
scrapers = make([]scraperConfig, 0)
// Cache stores scraper details.
type Cache struct {
scrapers []config
globalConfig GlobalConfig
}
// NewCache returns a new Cache loading scraper configurations from the
// scraper path provided in the global config object. It returns a new
// instance and an error if the scraper directory could not be loaded.
//
// Scraper configurations are loaded from yml files in the provided scrapers
// directory and any subdirectories.
func NewCache(globalConfig GlobalConfig) (*Cache, error) {
scrapers, err := loadScrapers(globalConfig.Path)
if err != nil {
return nil, err
}
return &Cache{
globalConfig: globalConfig,
scrapers: scrapers,
}, nil
}
func loadScrapers(path string) ([]config, error) {
scrapers := make([]config, 0)
logger.Debugf("Reading scraper configs from %s", path)
scraperFiles := []string{}
@@ -36,7 +59,7 @@ func loadScrapers() ([]scraperConfig, error) {
}
// add built-in freeones scraper
scrapers = append(scrapers, GetFreeonesScraper())
scrapers = append(scrapers, getFreeonesScraper())
for _, file := range scraperFiles {
scraper, err := loadScraperFromYAMLFile(file)
@@ -50,55 +73,55 @@ func loadScrapers() ([]scraperConfig, error) {
return scrapers, nil
}
func ReloadScrapers() error {
scrapers = nil
_, err := loadScrapers()
// ReloadScrapers clears the scraper cache and reloads from the scraper path.
// In the event of an error during loading, the cache will be left empty.
func (c *Cache) ReloadScrapers() error {
c.scrapers = nil
scrapers, err := loadScrapers(c.globalConfig.Path)
if err != nil {
return err
}
func ListPerformerScrapers() ([]*models.Scraper, error) {
// read scraper config files from the directory and cache
scrapers, err := loadScrapers()
if err != nil {
return nil, err
c.scrapers = scrapers
return nil
}
// UpdateConfig updates the global config for the cache. If the scraper path
// has changed, ReloadScrapers will need to be called separately.
func (c *Cache) UpdateConfig(globalConfig GlobalConfig) {
c.globalConfig = globalConfig
}
// ListPerformerScrapers returns a list of scrapers that are capable of
// scraping performers.
func (c Cache) ListPerformerScrapers() []*models.Scraper {
var ret []*models.Scraper
for _, s := range scrapers {
for _, s := range c.scrapers {
// filter on type
if s.supportsPerformers() {
ret = append(ret, s.toScraper())
}
}
return ret, nil
}
func ListSceneScrapers() ([]*models.Scraper, error) {
// read scraper config files from the directory and cache
scrapers, err := loadScrapers()
if err != nil {
return nil, err
return ret
}
// ListSceneScrapers returns a list of scrapers that are capable of
// scraping scenes.
func (c Cache) ListSceneScrapers() []*models.Scraper {
var ret []*models.Scraper
for _, s := range scrapers {
for _, s := range c.scrapers {
// filter on type
if s.supportsScenes() {
ret = append(ret, s.toScraper())
}
}
return ret, nil
return ret
}
func findScraper(scraperID string) *scraperConfig {
// read scraper config files from the directory and cache
loadScrapers()
for _, s := range scrapers {
func (c Cache) findScraper(scraperID string) *config {
for _, s := range c.scrapers {
if s.ID == scraperID {
return &s
}
@@ -107,27 +130,32 @@ func findScraper(scraperID string) *scraperConfig {
return nil
}
func ScrapePerformerList(scraperID string, query string) ([]*models.ScrapedPerformer, error) {
// ScrapePerformerList uses the scraper with the provided ID to query for
// performers using the provided query string. It returns a list of
// scraped performer data.
func (c Cache) ScrapePerformerList(scraperID string, query string) ([]*models.ScrapedPerformer, error) {
// find scraper with the provided id
s := findScraper(scraperID)
s := c.findScraper(scraperID)
if s != nil {
return s.ScrapePerformerNames(query)
return s.ScrapePerformerNames(query, c.globalConfig)
}
return nil, errors.New("Scraper with ID " + scraperID + " not found")
}
func ScrapePerformer(scraperID string, scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
// ScrapePerformer uses the scraper with the provided ID to scrape a
// performer using the provided performer fragment.
func (c Cache) ScrapePerformer(scraperID string, scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
// find scraper with the provided id
s := findScraper(scraperID)
s := c.findScraper(scraperID)
if s != nil {
ret, err := s.ScrapePerformer(scrapedPerformer)
ret, err := s.ScrapePerformer(scrapedPerformer, c.globalConfig)
if err != nil {
return nil, err
}
// post-process - set the image if applicable
if err := setPerformerImage(ret); err != nil {
if err := setPerformerImage(ret, c.globalConfig); err != nil {
logger.Warnf("Could not set image using URL %s: %s", *ret.Image, err.Error())
}
@@ -137,16 +165,19 @@ func ScrapePerformer(scraperID string, scrapedPerformer models.ScrapedPerformerI
return nil, errors.New("Scraper with ID " + scraperID + " not found")
}
func ScrapePerformerURL(url string) (*models.ScrapedPerformer, error) {
for _, s := range scrapers {
// ScrapePerformerURL uses the first scraper it finds that matches the URL
// provided to scrape a performer. If no scrapers are found that matches
// the URL, then nil is returned.
func (c Cache) ScrapePerformerURL(url string) (*models.ScrapedPerformer, error) {
for _, s := range c.scrapers {
if s.matchesPerformerURL(url) {
ret, err := s.ScrapePerformerURL(url)
ret, err := s.ScrapePerformerURL(url, c.globalConfig)
if err != nil {
return nil, err
}
// post-process - set the image if applicable
if err := setPerformerImage(ret); err != nil {
if err := setPerformerImage(ret, c.globalConfig); err != nil {
logger.Warnf("Could not set image using URL %s: %s", *ret.Image, err.Error())
}
@@ -194,6 +225,7 @@ func matchStudio(s *models.ScrapedSceneStudio) error {
s.ID = &id
return nil
}
func matchMovie(m *models.ScrapedSceneMovie) error {
qb := models.NewMovieQueryBuilder()
@@ -232,7 +264,7 @@ func matchTag(s *models.ScrapedSceneTag) error {
return nil
}
func postScrapeScene(ret *models.ScrapedScene) error {
func (c Cache) postScrapeScene(ret *models.ScrapedScene) error {
for _, p := range ret.Performers {
err := matchPerformer(p)
if err != nil {
@@ -262,25 +294,26 @@ func postScrapeScene(ret *models.ScrapedScene) error {
}
// post-process - set the image if applicable
if err := setSceneImage(ret); err != nil {
if err := setSceneImage(ret, c.globalConfig); err != nil {
logger.Warnf("Could not set image using URL %s: %s", *ret.Image, err.Error())
}
return nil
}
func ScrapeScene(scraperID string, scene models.SceneUpdateInput) (*models.ScrapedScene, error) {
// ScrapeScene uses the scraper with the provided ID to scrape a scene.
func (c Cache) ScrapeScene(scraperID string, scene models.SceneUpdateInput) (*models.ScrapedScene, error) {
// find scraper with the provided id
s := findScraper(scraperID)
s := c.findScraper(scraperID)
if s != nil {
ret, err := s.ScrapeScene(scene)
ret, err := s.ScrapeScene(scene, c.globalConfig)
if err != nil {
return nil, err
}
if ret != nil {
err = postScrapeScene(ret)
err = c.postScrapeScene(ret)
if err != nil {
return nil, err
}
@@ -292,16 +325,19 @@ func ScrapeScene(scraperID string, scene models.SceneUpdateInput) (*models.Scrap
return nil, errors.New("Scraper with ID " + scraperID + " not found")
}
func ScrapeSceneURL(url string) (*models.ScrapedScene, error) {
for _, s := range scrapers {
// ScrapeSceneURL uses the first scraper it finds that matches the URL
// provided to scrape a scene. If no scrapers are found that matches
// the URL, then nil is returned.
func (c Cache) ScrapeSceneURL(url string) (*models.ScrapedScene, error) {
for _, s := range c.scrapers {
if s.matchesSceneURL(url) {
ret, err := s.ScrapeSceneURL(url)
ret, err := s.ScrapeSceneURL(url, c.globalConfig)
if err != nil {
return nil, err
}
err = postScrapeScene(ret)
err = c.postScrapeScene(ret)
if err != nil {
return nil, err
}

View File

@@ -6,16 +6,32 @@ import (
"io"
"io/ioutil"
"os/exec"
"path/filepath"
"strings"
"github.com/stashapp/stash/pkg/logger"
"github.com/stashapp/stash/pkg/manager/config"
"github.com/stashapp/stash/pkg/models"
)
func runScraperScript(command []string, inString string, out interface{}) error {
type scriptScraper struct {
scraper scraperTypeConfig
config config
globalConfig GlobalConfig
}
func newScriptScraper(scraper scraperTypeConfig, config config, globalConfig GlobalConfig) *scriptScraper {
return &scriptScraper{
scraper: scraper,
config: config,
globalConfig: globalConfig,
}
}
func (s *scriptScraper) runScraperScript(inString string, out interface{}) error {
command := s.scraper.Script
cmd := exec.Command(command[0], command[1:]...)
cmd.Dir = config.GetScrapersPath()
cmd.Dir = filepath.Dir(s.config.path)
stdin, err := cmd.StdinPipe()
if err != nil {
@@ -65,12 +81,12 @@ func runScraperScript(command []string, inString string, out interface{}) error
return nil
}
func scrapePerformerNamesScript(c scraperTypeConfig, name string) ([]*models.ScrapedPerformer, error) {
func (s *scriptScraper) scrapePerformersByName(name string) ([]*models.ScrapedPerformer, error) {
inString := `{"name": "` + name + `"}`
var performers []models.ScrapedPerformer
err := runScraperScript(c.Script, inString, &performers)
err := s.runScraperScript(inString, &performers)
// convert to pointers
var ret []*models.ScrapedPerformer
@@ -83,7 +99,7 @@ func scrapePerformerNamesScript(c scraperTypeConfig, name string) ([]*models.Scr
return ret, err
}
func scrapePerformerFragmentScript(c scraperTypeConfig, scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
func (s *scriptScraper) scrapePerformerByFragment(scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
inString, err := json.Marshal(scrapedPerformer)
if err != nil {
@@ -92,22 +108,22 @@ func scrapePerformerFragmentScript(c scraperTypeConfig, scrapedPerformer models.
var ret models.ScrapedPerformer
err = runScraperScript(c.Script, string(inString), &ret)
err = s.runScraperScript(string(inString), &ret)
return &ret, err
}
func scrapePerformerURLScript(c scraperTypeConfig, url string) (*models.ScrapedPerformer, error) {
func (s *scriptScraper) scrapePerformerByURL(url string) (*models.ScrapedPerformer, error) {
inString := `{"url": "` + url + `"}`
var ret models.ScrapedPerformer
err := runScraperScript(c.Script, string(inString), &ret)
err := s.runScraperScript(string(inString), &ret)
return &ret, err
}
func scrapeSceneFragmentScript(c scraperTypeConfig, scene models.SceneUpdateInput) (*models.ScrapedScene, error) {
func (s *scriptScraper) scrapeSceneByFragment(scene models.SceneUpdateInput) (*models.ScrapedScene, error) {
inString, err := json.Marshal(scene)
if err != nil {
@@ -116,17 +132,17 @@ func scrapeSceneFragmentScript(c scraperTypeConfig, scene models.SceneUpdateInpu
var ret models.ScrapedScene
err = runScraperScript(c.Script, string(inString), &ret)
err = s.runScraperScript(string(inString), &ret)
return &ret, err
}
func scrapeSceneURLScript(c scraperTypeConfig, url string) (*models.ScrapedScene, error) {
func (s *scriptScraper) scrapeSceneByURL(url string) (*models.ScrapedScene, error) {
inString := `{"url": "` + url + `"}`
var ret models.ScrapedScene
err := runScraperScript(c.Script, string(inString), &ret)
err := s.runScraperScript(string(inString), &ret)
return &ret, err
}

View File

@@ -2,6 +2,7 @@ package scraper
import (
"context"
"errors"
"strconv"
"github.com/jinzhu/copier"
@@ -10,8 +11,22 @@ import (
"github.com/stashapp/stash/pkg/models"
)
func getStashClient(c scraperTypeConfig) *graphql.Client {
url := c.scraperConfig.StashServer.URL
type stashScraper struct {
scraper scraperTypeConfig
config config
globalConfig GlobalConfig
}
func newStashScraper(scraper scraperTypeConfig, config config, globalConfig GlobalConfig) *stashScraper {
return &stashScraper{
scraper: scraper,
config: config,
globalConfig: globalConfig,
}
}
func (s *stashScraper) getStashClient() *graphql.Client {
url := s.config.StashServer.URL
return graphql.NewClient(url+"/graphql", nil)
}
@@ -33,8 +48,8 @@ type stashFindPerformerNamesResultType struct {
Performers []*stashFindPerformerNamePerformer `graphql:"performers"`
}
func scrapePerformerNamesStash(c scraperTypeConfig, name string) ([]*models.ScrapedPerformer, error) {
client := getStashClient(c)
func (s *stashScraper) scrapePerformersByName(name string) ([]*models.ScrapedPerformer, error) {
client := s.getStashClient()
var q struct {
FindPerformers stashFindPerformerNamesResultType `graphql:"findPerformers(filter: $f)"`
@@ -64,8 +79,8 @@ func scrapePerformerNamesStash(c scraperTypeConfig, name string) ([]*models.Scra
return ret, nil
}
func scrapePerformerFragmentStash(c scraperTypeConfig, scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
client := getStashClient(c)
func (s *stashScraper) scrapePerformerByFragment(scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
client := s.getStashClient()
var q struct {
FindPerformer *models.ScrapedPerformerStash `graphql:"findPerformer(id: $f)"`
@@ -91,7 +106,7 @@ func scrapePerformerFragmentStash(c scraperTypeConfig, scrapedPerformer models.S
}
// get the performer image directly
ret.Image, err = getStashPerformerImage(c.scraperConfig.StashServer.URL, performerID)
ret.Image, err = getStashPerformerImage(s.config.StashServer.URL, performerID, s.globalConfig)
if err != nil {
return nil, err
}
@@ -99,7 +114,7 @@ func scrapePerformerFragmentStash(c scraperTypeConfig, scrapedPerformer models.S
return &ret, nil
}
func scrapeSceneFragmentStash(c scraperTypeConfig, scene models.SceneUpdateInput) (*models.ScrapedScene, error) {
func (s *stashScraper) scrapeSceneByFragment(scene models.SceneUpdateInput) (*models.ScrapedScene, error) {
// query by MD5
// assumes that the scene exists in the database
qb := models.NewSceneQueryBuilder()
@@ -123,7 +138,7 @@ func scrapeSceneFragmentStash(c scraperTypeConfig, scene models.SceneUpdateInput
"c": &checksum,
}
client := getStashClient(c)
client := s.getStashClient()
err = client.Query(context.Background(), &q, vars)
if err != nil {
return nil, err
@@ -152,10 +167,18 @@ func scrapeSceneFragmentStash(c scraperTypeConfig, scene models.SceneUpdateInput
}
// get the performer image directly
ret.Image, err = getStashSceneImage(c.scraperConfig.StashServer.URL, q.FindScene.ID)
ret.Image, err = getStashSceneImage(s.config.StashServer.URL, q.FindScene.ID, s.globalConfig)
if err != nil {
return nil, err
}
return &ret, nil
}
func (s *stashScraper) scrapePerformerByURL(url string) (*models.ScrapedPerformer, error) {
return nil, errors.New("scrapePerformerByURL not supported for stash scraper")
}
func (s *stashScraper) scrapeSceneByURL(url string) (*models.ScrapedScene, error) {
return nil, errors.New("scrapeSceneByURL not supported for stash scraper")
}

View File

@@ -6,7 +6,6 @@ import (
"net/http"
"net/http/cookiejar"
"net/url"
"reflect"
"regexp"
"strings"
"time"
@@ -17,7 +16,6 @@ import (
"golang.org/x/net/publicsuffix"
"github.com/stashapp/stash/pkg/logger"
"github.com/stashapp/stash/pkg/manager/config"
"github.com/stashapp/stash/pkg/models"
)
@@ -25,572 +23,94 @@ import (
// configurable at some point.
const scrapeGetTimeout = time.Second * 30
var debugMode = false
type commonXPathConfig map[string]string
func (c commonXPathConfig) applyCommon(src string) string {
ret := src
for commonKey, commonVal := range c {
if strings.Contains(ret, commonKey) {
ret = strings.Replace(ret, commonKey, commonVal, -1)
}
}
return ret
}
type xpathScraperConfig map[string]interface{}
func createXPathScraperConfig(src map[interface{}]interface{}) xpathScraperConfig {
ret := make(xpathScraperConfig)
if src != nil {
for k, v := range src {
keyStr, isStr := k.(string)
if isStr {
ret[keyStr] = v
}
}
}
return ret
}
type xpathRegexConfig map[interface{}]interface{}
type xpathRegexConfigs []xpathRegexConfig
func (c xpathRegexConfig) apply(value string) string {
regex := ""
with := ""
if regexI, _ := c["regex"]; regexI != nil {
regex, _ = regexI.(string)
}
if withI, _ := c["with"]; withI != nil {
with, _ = withI.(string)
}
if regex != "" {
re, err := regexp.Compile(regex)
if err != nil {
logger.Warnf("Error compiling regex '%s': %s", regex, err.Error())
return value
}
ret := re.ReplaceAllString(value, with)
// replace lines if needed to protect from commonPostprocess
if with == "\n" {
ret = replaceLines(ret)
}
logger.Debugf(`Replace: '%s' with '%s'`, regex, with)
logger.Debugf("Before: %s", value)
logger.Debugf("After: %s", ret)
return ret
}
return value
}
func (c xpathRegexConfigs) apply(value string) string {
// apply regex in order
for _, config := range c {
value = config.apply(value)
}
// remove whitespace again
value = commonPostProcess(value)
// restore replaced lines
value = restoreLines(value)
return value
}
type xpathScraperAttrConfig map[interface{}]interface{}
func (c xpathScraperAttrConfig) getString(key string) string {
ret, _ := c[key]
if ret == nil {
return ""
}
asStr, _ := ret.(string)
return asStr
}
func (c xpathScraperAttrConfig) getSelector() string {
const selectorKey = "selector"
return c.getString(selectorKey)
}
func (c xpathScraperAttrConfig) getConcat() string {
const concatKey = "concat"
return c.getString(concatKey)
}
func (c xpathScraperAttrConfig) hasConcat() bool {
return c.getConcat() != ""
}
func (c xpathScraperAttrConfig) getParseDate() string {
const parseDateKey = "parseDate"
return c.getString(parseDateKey)
}
func (c xpathScraperAttrConfig) getSplit() string {
const splitKey = "split"
return c.getString(splitKey)
}
func (c xpathScraperAttrConfig) hasSplit() bool {
return c.getSplit() != ""
}
func (c xpathScraperAttrConfig) getReplace() xpathRegexConfigs {
const replaceKey = "replace"
val, _ := c[replaceKey]
var ret xpathRegexConfigs
if val == nil {
return ret
}
asSlice, _ := val.([]interface{})
for _, v := range asSlice {
asMap, _ := v.(map[interface{}]interface{})
ret = append(ret, xpathRegexConfig(asMap))
}
return ret
}
func (c xpathScraperAttrConfig) getSubScraper() xpathScraperAttrConfig {
const subScraperKey = "subScraper"
val, _ := c[subScraperKey]
if val == nil {
return nil
}
asMap, _ := val.(map[interface{}]interface{})
if asMap != nil {
return xpathScraperAttrConfig(asMap)
}
return nil
}
func (c xpathScraperAttrConfig) concatenateResults(nodes []*html.Node) string {
separator := c.getConcat()
result := []string{}
for _, elem := range nodes {
text := NodeText(elem)
text = commonPostProcess(text)
result = append(result, text)
}
return strings.Join(result, separator)
}
func (c xpathScraperAttrConfig) parseDate(value string) string {
parseDate := c.getParseDate()
if parseDate == "" {
return value
}
// try to parse the date using the pattern
// if it fails, then just fall back to the original value
parsedValue, err := time.Parse(parseDate, value)
if err != nil {
logger.Warnf("Error parsing date string '%s' using format '%s': %s", value, parseDate, err.Error())
return value
}
// convert it into our date format
const internalDateFormat = "2006-01-02"
return parsedValue.Format(internalDateFormat)
}
func (c xpathScraperAttrConfig) splitString(value string) []string {
separator := c.getSplit()
var res []string
if separator == "" {
return []string{value}
}
for _, str := range strings.Split(value, separator) {
if str != "" {
res = append(res, str)
}
}
return res
}
// setKeyAndSplit sets the key "k" for the results "ret" and splits if needed
// "i" is the index starting position
func (c xpathScraperAttrConfig) setKeyAndSplit(ret *xPathResults, value string, k string, i int) {
if c.hasSplit() {
for j, txt := range c.splitString(value) {
*ret = ret.setKey(j+i, k, txt)
}
} else {
*ret = ret.setKey(i, k, value)
}
}
func (c xpathScraperAttrConfig) replaceRegex(value string) string {
replace := c.getReplace()
return replace.apply(value)
}
func (c xpathScraperAttrConfig) applySubScraper(value string) string {
subScraper := c.getSubScraper()
if subScraper == nil {
return value
}
logger.Debugf("Sub-scraping for: %s", value)
doc, err := loadURL(value)
if err != nil {
logger.Warnf("Error getting URL '%s' for sub-scraper: %s", value, err.Error())
return ""
}
found := runXPathQuery(doc, subScraper.getSelector(), nil)
if len(found) > 0 {
// check if we're concatenating the results into a single result
var result string
if subScraper.hasConcat() {
result = subScraper.concatenateResults(found)
} else {
result = NodeText(found[0])
result = commonPostProcess(result)
}
result = subScraper.postProcess(result)
return result
}
return ""
}
func (c xpathScraperAttrConfig) postProcess(value string) string {
// perform regex replacements first
value = c.replaceRegex(value)
value = c.applySubScraper(value)
value = c.parseDate(value)
return value
}
func commonPostProcess(value string) string {
value = strings.TrimSpace(value)
// remove multiple whitespace and end lines
re := regexp.MustCompile("\n")
value = re.ReplaceAllString(value, "")
re = regexp.MustCompile(" +")
value = re.ReplaceAllString(value, " ")
return value
}
// func replaceLines replaces all newlines ("\n") with alert ("\a")
func replaceLines(value string) string {
re := regexp.MustCompile("\a") // \a shouldn't exist in the string
value = re.ReplaceAllString(value, "") // remove it
re = regexp.MustCompile("\n") // replace newlines with (\a)'s so that they don't get removed by commonPostprocess
value = re.ReplaceAllString(value, "\a")
return value
}
// func restoreLines replaces all alerts ("\a") with newlines ("\n")
func restoreLines(value string) string {
re := regexp.MustCompile("\a")
value = re.ReplaceAllString(value, "\n")
return value
}
func runXPathQuery(doc *html.Node, xpath string, common commonXPathConfig) []*html.Node {
// apply common
if common != nil {
xpath = common.applyCommon(xpath)
}
found, err := htmlquery.QueryAll(doc, xpath)
if err != nil {
logger.Warnf("Error parsing xpath expression '%s': %s", xpath, err.Error())
return nil
}
return found
}
func (s xpathScraperConfig) process(doc *html.Node, common commonXPathConfig) xPathResults {
var ret xPathResults
for k, value := range s {
switch v := value.(type) {
case string:
found := runXPathQuery(doc, v, common)
if len(found) > 0 {
for i, elem := range found {
text := NodeText(elem)
text = commonPostProcess(text)
ret = ret.setKey(i, k, text)
}
}
case map[interface{}]interface{}:
attrConfig := xpathScraperAttrConfig(v)
found := runXPathQuery(doc, attrConfig.getSelector(), common)
if len(found) > 0 {
// check if we're concatenating the results into a single result
if attrConfig.hasConcat() {
result := attrConfig.concatenateResults(found)
result = attrConfig.postProcess(result)
attrConfig.setKeyAndSplit(&ret, result, k, 0)
} else {
for i, elem := range found {
text := NodeText(elem)
text = commonPostProcess(text)
text = attrConfig.postProcess(text)
attrConfig.setKeyAndSplit(&ret, text, k, i)
}
}
}
}
}
return ret
}
type xpathScrapers map[string]*xpathScraper
type xpathScraper struct {
Common commonXPathConfig `yaml:"common"`
Scene xpathScraperConfig `yaml:"scene"`
Performer xpathScraperConfig `yaml:"performer"`
scraper scraperTypeConfig
config config
globalConfig GlobalConfig
}
const (
XPathScraperConfigSceneTags = "Tags"
XPathScraperConfigScenePerformers = "Performers"
XPathScraperConfigSceneStudio = "Studio"
XPathScraperConfigSceneMovies = "Movies"
)
func (s xpathScraper) GetSceneSimple() xpathScraperConfig {
// exclude the complex sub-configs
ret := make(xpathScraperConfig)
mapped := s.Scene
if mapped != nil {
for k, v := range mapped {
if k != XPathScraperConfigSceneTags && k != XPathScraperConfigScenePerformers && k != XPathScraperConfigSceneStudio && k != XPathScraperConfigSceneMovies {
ret[k] = v
}
}
}
return ret
}
func (s xpathScraper) getSceneSubMap(key string) xpathScraperConfig {
var ret map[interface{}]interface{}
mapped := s.Scene
if mapped != nil {
v, ok := mapped[key]
if ok {
ret, _ = v.(map[interface{}]interface{})
func newXpathScraper(scraper scraperTypeConfig, config config, globalConfig GlobalConfig) *xpathScraper {
return &xpathScraper{
scraper: scraper,
config: config,
globalConfig: globalConfig,
}
}
if ret != nil {
return createXPathScraperConfig(ret)
func (s *xpathScraper) getXpathScraper() *mappedScraper {
return s.config.XPathScrapers[s.scraper.Scraper]
}
return nil
func (s *xpathScraper) scrapeURL(url string) (*html.Node, *mappedScraper, error) {
scraper := s.getXpathScraper()
if scraper == nil {
return nil, nil, errors.New("xpath scraper with name " + s.scraper.Scraper + " not found in config")
}
func (s xpathScraper) GetScenePerformers() xpathScraperConfig {
return s.getSceneSubMap(XPathScraperConfigScenePerformers)
doc, err := s.loadURL(url)
if err != nil {
return nil, nil, err
}
func (s xpathScraper) GetSceneTags() xpathScraperConfig {
return s.getSceneSubMap(XPathScraperConfigSceneTags)
return doc, scraper, nil
}
func (s xpathScraper) GetSceneStudio() xpathScraperConfig {
return s.getSceneSubMap(XPathScraperConfigSceneStudio)
func (s *xpathScraper) scrapePerformerByURL(url string) (*models.ScrapedPerformer, error) {
doc, scraper, err := s.scrapeURL(url)
if err != nil {
return nil, err
}
func (s xpathScraper) GetSceneMovies() xpathScraperConfig {
return s.getSceneSubMap(XPathScraperConfigSceneMovies)
q := s.getXPathQuery(doc)
return scraper.scrapePerformer(q)
}
func (s xpathScraper) scrapePerformer(doc *html.Node) (*models.ScrapedPerformer, error) {
var ret models.ScrapedPerformer
performerMap := s.Performer
if performerMap == nil {
return nil, nil
func (s *xpathScraper) scrapeSceneByURL(url string) (*models.ScrapedScene, error) {
doc, scraper, err := s.scrapeURL(url)
if err != nil {
return nil, err
}
results := performerMap.process(doc, s.Common)
if len(results) > 0 {
results[0].apply(&ret)
q := s.getXPathQuery(doc)
return scraper.scrapeScene(q)
}
return &ret, nil
func (s *xpathScraper) scrapePerformersByName(name string) ([]*models.ScrapedPerformer, error) {
scraper := s.getXpathScraper()
if scraper == nil {
return nil, errors.New("xpath scraper with name " + s.scraper.Scraper + " not found in config")
}
func (s xpathScraper) scrapePerformers(doc *html.Node) ([]*models.ScrapedPerformer, error) {
var ret []*models.ScrapedPerformer
const placeholder = "{}"
performerMap := s.Performer
if performerMap == nil {
return nil, nil
// replace the placeholder string with the URL-escaped name
escapedName := url.QueryEscape(name)
url := s.scraper.QueryURL
url = strings.Replace(url, placeholder, escapedName, -1)
doc, err := s.loadURL(url)
if err != nil {
return nil, err
}
results := performerMap.process(doc, s.Common)
for _, r := range results {
var p models.ScrapedPerformer
r.apply(&p)
ret = append(ret, &p)
q := s.getXPathQuery(doc)
return scraper.scrapePerformers(q)
}
return ret, nil
func (s *xpathScraper) scrapePerformerByFragment(scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
return nil, errors.New("scrapePerformerByFragment not supported for xpath scraper")
}
func (s xpathScraper) scrapeScene(doc *html.Node) (*models.ScrapedScene, error) {
var ret models.ScrapedScene
sceneMap := s.GetSceneSimple()
if sceneMap == nil {
return nil, nil
func (s *xpathScraper) scrapeSceneByFragment(scene models.SceneUpdateInput) (*models.ScrapedScene, error) {
return nil, errors.New("scrapeSceneByFragment not supported for xpath scraper")
}
scenePerformersMap := s.GetScenePerformers()
sceneTagsMap := s.GetSceneTags()
sceneStudioMap := s.GetSceneStudio()
sceneMoviesMap := s.GetSceneMovies()
logger.Debug(`Processing scene:`)
results := sceneMap.process(doc, s.Common)
if len(results) > 0 {
results[0].apply(&ret)
// now apply the performers and tags
if scenePerformersMap != nil {
logger.Debug(`Processing scene performers:`)
performerResults := scenePerformersMap.process(doc, s.Common)
for _, p := range performerResults {
performer := &models.ScrapedScenePerformer{}
p.apply(performer)
ret.Performers = append(ret.Performers, performer)
}
}
if sceneTagsMap != nil {
logger.Debug(`Processing scene tags:`)
tagResults := sceneTagsMap.process(doc, s.Common)
for _, p := range tagResults {
tag := &models.ScrapedSceneTag{}
p.apply(tag)
ret.Tags = append(ret.Tags, tag)
}
}
if sceneStudioMap != nil {
logger.Debug(`Processing scene studio:`)
studioResults := sceneStudioMap.process(doc, s.Common)
if len(studioResults) > 0 {
studio := &models.ScrapedSceneStudio{}
studioResults[0].apply(studio)
ret.Studio = studio
}
}
if sceneMoviesMap != nil {
logger.Debug(`Processing scene movies:`)
movieResults := sceneMoviesMap.process(doc, s.Common)
for _, p := range movieResults {
movie := &models.ScrapedSceneMovie{}
p.apply(movie)
ret.Movies = append(ret.Movies, movie)
}
}
}
return &ret, nil
}
type xPathResult map[string]string
type xPathResults []xPathResult
func (r xPathResult) apply(dest interface{}) {
destVal := reflect.ValueOf(dest)
// dest should be a pointer
destVal = destVal.Elem()
for key, value := range r {
field := destVal.FieldByName(key)
if field.IsValid() {
var reflectValue reflect.Value
if field.Kind() == reflect.Ptr {
// need to copy the value, otherwise everything is set to the
// same pointer
localValue := value
reflectValue = reflect.ValueOf(&localValue)
} else {
reflectValue = reflect.ValueOf(value)
}
field.Set(reflectValue)
} else {
logger.Errorf("Field %s does not exist in %T", key, dest)
}
}
}
func (r xPathResults) setKey(index int, key string, value string) xPathResults {
if index >= len(r) {
r = append(r, make(xPathResult))
}
logger.Debugf(`[%d][%s] = %s`, index, key, value)
r[index][key] = value
return r
}
func loadURL(url string) (*html.Node, error) {
func (s *xpathScraper) loadURL(url string) (*html.Node, error) {
options := cookiejar.Options{
PublicSuffixList: publicsuffix.List,
}
@@ -615,7 +135,7 @@ func loadURL(url string) (*html.Node, error) {
return nil, err
}
userAgent := config.GetScraperUserAgent()
userAgent := s.globalConfig.UserAgent
if userAgent != "" {
req.Header.Set("User-Agent", userAgent)
}
@@ -633,7 +153,7 @@ func loadURL(url string) (*html.Node, error) {
ret, err := html.Parse(r)
if err == nil && debugMode {
if err == nil && s.config.DebugOptions != nil && s.config.DebugOptions.PrintHTML {
var b bytes.Buffer
html.Render(&b, ret)
logger.Infof("loadURL (%s) response: \n%s", url, b.String())
@@ -642,73 +162,65 @@ func loadURL(url string) (*html.Node, error) {
return ret, err
}
func scrapePerformerURLXpath(c scraperTypeConfig, url string) (*models.ScrapedPerformer, error) {
scraper := c.scraperConfig.XPathScrapers[c.Scraper]
if scraper == nil {
return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
func (s *xpathScraper) getXPathQuery(doc *html.Node) *xpathQuery {
return &xpathQuery{
doc: doc,
scraper: s,
}
}
if c.scraperConfig != nil && c.scraperConfig.DebugOptions != nil && c.scraperConfig.DebugOptions.PrintHTML {
debugMode = true
type xpathQuery struct {
doc *html.Node
scraper *xpathScraper
}
doc, err := loadURL(url)
func (q *xpathQuery) runQuery(selector string) []string {
found, err := htmlquery.QueryAll(q.doc, selector)
if err != nil {
return nil, err
logger.Warnf("Error parsing xpath expression '%s': %s", selector, err.Error())
return nil
}
return scraper.scrapePerformer(doc)
var ret []string
for _, n := range found {
// don't add empty strings
nodeText := q.nodeText(n)
if nodeText != "" {
ret = append(ret, q.nodeText(n))
}
}
func scrapeSceneURLXPath(c scraperTypeConfig, url string) (*models.ScrapedScene, error) {
scraper := c.scraperConfig.XPathScrapers[c.Scraper]
if scraper == nil {
return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
return ret
}
if c.scraperConfig != nil && c.scraperConfig.DebugOptions != nil && c.scraperConfig.DebugOptions.PrintHTML {
debugMode = true
}
doc, err := loadURL(url)
if err != nil {
return nil, err
}
return scraper.scrapeScene(doc)
}
func scrapePerformerNamesXPath(c scraperTypeConfig, name string) ([]*models.ScrapedPerformer, error) {
scraper := c.scraperConfig.XPathScrapers[c.Scraper]
if scraper == nil {
return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
}
const placeholder = "{}"
// replace the placeholder string with the URL-escaped name
escapedName := url.QueryEscape(name)
u := c.QueryURL
u = strings.Replace(u, placeholder, escapedName, -1)
doc, err := loadURL(u)
if err != nil {
return nil, err
}
return scraper.scrapePerformers(doc)
}
func NodeText(n *html.Node) string {
func (q *xpathQuery) nodeText(n *html.Node) string {
var ret string
if n != nil && n.Type == html.CommentNode {
return htmlquery.OutputHTML(n, true)
ret = htmlquery.OutputHTML(n, true)
}
return htmlquery.InnerText(n)
ret = htmlquery.InnerText(n)
// trim all leading and trailing whitespace
ret = strings.TrimSpace(ret)
// remove multiple whitespace
re := regexp.MustCompile(" +")
ret = re.ReplaceAllString(ret, " ")
// TODO - make this optional
re = regexp.MustCompile("\n")
ret = re.ReplaceAllString(ret, "")
return ret
}
func (q *xpathQuery) subScrape(value string) mappedQuery {
doc, err := q.scraper.loadURL(value)
if err != nil {
logger.Warnf("Error getting URL '%s' for sub-scraper: %s", value, err.Error())
return nil
}
return q.scraper.getXPathQuery(doc)
}

View File

@@ -1,11 +1,15 @@
package scraper
import (
"fmt"
"net/http"
"net/http/httptest"
"strings"
"testing"
"github.com/antchfx/htmlquery"
"github.com/stashapp/stash/pkg/models"
"github.com/stretchr/testify/assert"
"gopkg.in/yaml.v2"
)
@@ -183,49 +187,79 @@ func makeCommonXPath(attr string) string {
return `//table[@id="biographyTable"]//tr/td[@class="paramname"]//b[text() = '` + attr + `']/ancestor::tr/td[@class="paramvalue"]`
}
func makeReplaceRegex(regex string, with string) map[interface{}]interface{} {
ret := make(map[interface{}]interface{})
func makeSimpleAttrConfig(str string) mappedScraperAttrConfig {
return mappedScraperAttrConfig{
Selector: str,
}
}
func makeReplaceRegex(regex string, with string) mappedRegexConfig {
ret := mappedRegexConfig{
Regex: regex,
With: with,
}
ret["regex"] = regex
ret["with"] = with
return ret
}
func makeXPathConfig() xpathScraperConfig {
config := make(xpathScraperConfig)
func makeXPathConfig() mappedPerformerScraperConfig {
config := mappedPerformerScraperConfig{
mappedConfig: make(mappedConfig),
}
config["Name"] = makeCommonXPath("Babe Name:") + `/a`
config["Ethnicity"] = makeCommonXPath("Ethnicity:")
config["Country"] = makeCommonXPath("Country of Origin:")
config["Aliases"] = makeCommonXPath("Aliases:")
config["EyeColor"] = makeCommonXPath("Eye Color:")
config["Measurements"] = makeCommonXPath("Measurements:")
config["FakeTits"] = makeCommonXPath("Fake boobs:")
config["Height"] = makeCommonXPath("Height:")
config["Tattoos"] = makeCommonXPath("Tattoos:")
config["Piercings"] = makeCommonXPath("Piercings:")
config.mappedConfig["Name"] = makeSimpleAttrConfig(makeCommonXPath("Babe Name:") + `/a`)
config.mappedConfig["Ethnicity"] = makeSimpleAttrConfig(makeCommonXPath("Ethnicity:"))
config.mappedConfig["Country"] = makeSimpleAttrConfig(makeCommonXPath("Country of Origin:"))
config.mappedConfig["Aliases"] = makeSimpleAttrConfig(makeCommonXPath("Aliases:"))
config.mappedConfig["EyeColor"] = makeSimpleAttrConfig(makeCommonXPath("Eye Color:"))
config.mappedConfig["Measurements"] = makeSimpleAttrConfig(makeCommonXPath("Measurements:"))
config.mappedConfig["FakeTits"] = makeSimpleAttrConfig(makeCommonXPath("Fake boobs:"))
config.mappedConfig["Height"] = makeSimpleAttrConfig(makeCommonXPath("Height:"))
config.mappedConfig["Tattoos"] = makeSimpleAttrConfig(makeCommonXPath("Tattoos:"))
config.mappedConfig["Piercings"] = makeSimpleAttrConfig(makeCommonXPath("Piercings:"))
// special handling for birthdate
birthdateAttrConfig := make(map[interface{}]interface{})
birthdateAttrConfig["selector"] = makeCommonXPath("Date of Birth:")
birthdateAttrConfig := makeSimpleAttrConfig(makeCommonXPath("Date of Birth:"))
var birthdateReplace []interface{}
var birthdateReplace mappedRegexConfigs
// make this leave the trailing space to test existing scrapers that do so
birthdateReplace = append(birthdateReplace, makeReplaceRegex(`\(.* years old\)`, ""))
birthdateAttrConfig["replace"] = birthdateReplace
birthdateAttrConfig["parseDate"] = "January 2, 2006" // "July 1, 1992 (27 years old) "
config["Birthdate"] = birthdateAttrConfig
birthdateReplaceAction := postProcessReplace(birthdateReplace)
birthdateParseDate := postProcessParseDate("January 2, 2006") // "July 1, 1992 (27 years old) "
birthdateAttrConfig.postProcessActions = []postProcessAction{
&birthdateReplaceAction,
&birthdateParseDate,
}
config.mappedConfig["Birthdate"] = birthdateAttrConfig
// special handling for career length
careerLengthAttrConfig := make(map[interface{}]interface{})
// no colon in attribute header
careerLengthAttrConfig["selector"] = makeCommonXPath("Career Start And End")
careerLengthAttrConfig := makeSimpleAttrConfig(makeCommonXPath("Career Start And End"))
var careerLengthReplace []interface{}
var careerLengthReplace mappedRegexConfigs
careerLengthReplace = append(careerLengthReplace, makeReplaceRegex(`\s+\(.*\)`, ""))
careerLengthAttrConfig["replace"] = careerLengthReplace
careerLengthReplaceAction := postProcessReplace(careerLengthReplace)
careerLengthAttrConfig.postProcessActions = []postProcessAction{
&careerLengthReplaceAction,
}
config["CareerLength"] = careerLengthAttrConfig
config.mappedConfig["CareerLength"] = careerLengthAttrConfig
// use map post-process action for gender
genderConfig := makeSimpleAttrConfig(makeCommonXPath("Profession:"))
genderMapAction := make(postProcessMap)
genderMapAction["Porn Star"] = "Female"
genderConfig.postProcessActions = []postProcessAction{
&genderMapAction,
}
config.mappedConfig["Gender"] = genderConfig
// use fixed for height
config.mappedConfig["Height"] = mappedScraperAttrConfig{
Fixed: "1234",
}
return config
}
@@ -253,11 +287,15 @@ func TestScrapePerformerXPath(t *testing.T) {
xpathConfig := makeXPathConfig()
scraper := xpathScraper{
Performer: xpathConfig,
scraper := mappedScraper{
Performer: &xpathConfig,
}
performer, err := scraper.scrapePerformer(doc)
q := &xpathQuery{
doc: doc,
}
performer, err := scraper.scrapePerformer(q)
if err != nil {
t.Errorf("Error scraping performer: %s", err.Error())
@@ -274,8 +312,11 @@ func TestScrapePerformerXPath(t *testing.T) {
const fakeTits = "No"
const careerLength = "2012 - 2019"
const tattoosPiercings = "None"
const gender = "Female"
const height = "1234"
verifyField(t, performerName, performer.Name, "Name")
verifyField(t, gender, performer.Gender, "Gender")
verifyField(t, ethnicity, performer.Ethnicity, "Ethnicity")
verifyField(t, country, performer.Country, "Country")
@@ -290,6 +331,7 @@ func TestScrapePerformerXPath(t *testing.T) {
verifyField(t, tattoosPiercings, performer.Tattoos, "Tattoos")
verifyField(t, tattoosPiercings, performer.Piercings, "Piercings")
verifyField(t, height, performer.Height, "Piercings")
}
func TestConcatXPath(t *testing.T) {
@@ -313,18 +355,25 @@ func TestConcatXPath(t *testing.T) {
return
}
xpathConfig := make(xpathScraperConfig)
nameAttrConfig := make(map[interface{}]interface{})
nameAttrConfig["selector"] = "//div"
nameAttrConfig["concat"] = separator
xpathConfig := make(mappedConfig)
nameAttrConfig := mappedScraperAttrConfig{
Selector: "//div",
Concat: separator,
}
xpathConfig["Name"] = nameAttrConfig
xpathConfig["EyeColor"] = "//span"
xpathConfig["EyeColor"] = makeSimpleAttrConfig("//span")
scraper := xpathScraper{
Performer: xpathConfig,
scraper := mappedScraper{
Performer: &mappedPerformerScraperConfig{
mappedConfig: xpathConfig,
},
}
performer, err := scraper.scrapePerformer(doc)
q := &xpathQuery{
doc: doc,
}
performer, err := scraper.scrapePerformer(q)
if err != nil {
t.Errorf("Error scraping performer: %s", err.Error())
@@ -342,50 +391,19 @@ const sceneHTML = `
<head>
<title>Test Video - Pornhub.com</title>
<meta property="og:title" content="Test Video" />
<meta property="og:description"
content="Watch Test Video on Pornhub.com, the best hardcore porn site. Pornhub is home to the widest selection of free Babe sex videos full of the hottest pornstars. If you&#039;re craving 3some XXX movies you&#039;ll find them here." />
<meta property="og:image"
content="https://di.phncdn.com/videos/201910/13/254476211/thumbs_80/(m=eaAaGwObaaaa)(mh=_V1YEGdMFS1rEYoW)9.jpg" />
<script type="application/ld+json">
{
"@context": "http://schema.org/",
"@type": "VideoObject",
"name": "Test Video",
"embedUrl": "https://www.pornhub.com/embed/ph5da270596459c",
"duration": "PT00H33M27S",
"thumbnailUrl": "https://di.phncdn.com/videos/201910/13/254476211/thumbs_80/(m=eaAaGwObaaaa)(mh=_V1YEGdMFS1rEYoW)9.jpg",
"uploadDate": "2019-10-13T00:33:51+00:00",
"description": "Watch Test Video on Pornhub&period;com&comma; the best hardcore porn site&period; Pornhub is home to the widest selection of free Babe sex videos full of the hottest pornstars&period; If you&apos;re craving 3some XXX movies you&apos;ll find them here&period;",
"author" : "Mia Malkova", "interactionStatistic": [
{
"@type": "InteractionCounter",
"interactionType": "http://schema.org/WatchAction",
"userInteractionCount": "5,908,861"
},
{
"@type": "InteractionCounter",
"interactionType": "http://schema.org/LikeAction",
"userInteractionCount": "22,090"
}
]
"author" : "Mia Malkova"
}
</script>
</head>
<body class="logged-out">
<div class="container ">
<div id="main-container" class="clearfix" data-delete-check="1" data-is-private="1" data-is-premium=""
data-liu="0" data-next-shuffle="ph5da270596459c" data-pkey="" data-platform-pc="1" data-playlist-check="0"
data-playlist-id-check="0" data-playlist-geo-check="0" data-friend="0" data-playlist-user-check="0"
data-playlist-video-check="0" data-playlist-shuffle="0" data-shuffle-forward="ph5da270596459c"
data-shuffle-back="ph5da270596459c" data-min-large="1350"
data-video-title="Test Video">
<div id="main-container" class="clearfix">
<div id="vpContentContainer">
<div id="hd-leftColVideoPage">
<div class="video-wrapper">
@@ -402,45 +420,27 @@ const sceneHTML = `
<div class="video-detailed-info">
<div class="video-info-row">
From:&nbsp;
<div class="usernameWrap clearfix" data-type="channel" data-userid="492538092"
data-liu-user="0"
data-json-url="/user/box?id=492538092&amp;token=MTU3NzA1NTkzNIqATol8v_WrhmNTXkeflvG09C2U7UUT_NyoZUFa7iKq0mlzBkmdgAH1aNHZkJmIOHbbwmho1BehHDoA63K5Wn4."
data-disable-popover="0">
<div class="usernameWrap clearfix" data-type="channel">
<a rel="" href="/channels/sis-loves-me" class="bolded">Sis Loves Me</a>
<div class="avatarPosition"></div>
</div>
<span class="verified-icon flag tooltipTrig"
data-title="Verified member"></span>
- 87 videos
<span class="subscribers-count">&nbsp;459466</span>
</div>
<div class="video-info-row">
<div class="pornstarsWrapper">
Pornstars:&nbsp;
<a class="pstar-list-btn js-mxp" data-mxptype="Pornstar"
data-mxptext="Alex D" data-id="251341" data-login="1"
href="/pornstar/alex-d">Alex D <span
class="psbox-link-container display-none"></span>
data-mxptext="Alex D" href="/pornstar/alex-d">Alex D
</a>
, <a class="pstar-list-btn js-mxp" data-mxptype="Pornstar"
data-mxptext="Mia Malkova" data-id="10641" data-login="1"
href="/pornstar/mia-malkova">Mia Malkova <span
class="psbox-link-container display-none"></span>
data-mxptext="Mia Malkova" href="/pornstar/mia-malkova">
</a>
, <a class="pstar-list-btn js-mxp" data-mxptype="Pornstar"
data-mxptext="Riley Reid" data-id="5343" data-login="1"
href="/pornstar/riley-reid">Riley Reid <span
class="psbox-link-container display-none"></span>
data-mxptext="Riley Reid" href="/pornstar/riley-reid">Riley Reid
</a>
<div class="tooltipTrig suggestBtn" data-title="Add a pornstar">
<a class="add-btn-small add-pornstar-btn-2">+
<span>Suggest</span></a>
</div>
<div id="deletePornstarResult" class="suggest-result"></div>
</div>
</div>
@@ -475,14 +475,6 @@ const sceneHTML = `
</div>
</div>
<div class="video-info-row showLess">
<div class="productionWrapper">
Production:&nbsp;
<a href="/video?p=professional" rel="nofollow"
class="production">professional</a>
</div>
</div>
<div class="video-info-row showLess">
<div class="tagsWrapper">
Tags:&nbsp;
@@ -510,121 +502,6 @@ const sceneHTML = `
</div>
</div>
</div>
<div class="video-info-row showLess">
Added on: <span class="white">2 months ago</span>
</div>
<div class="video-info-row showLess">
Featured on: <span class="white">1 month ago</span>
</div>
</div>
</div>
<div class="video-action-tab jump-to-tab">
<div class="title">Jump to your favorite action</div>
<div class="filters mainFilter float-right">
<div class="dropdownTrigger">
<div>
<span class="textFilter" id="tagSort">Sequence</span>
<span class="arrowFilters"></span>
</div>
<ul class="filterListItem dropdownWrapper">
<li class="active"><a class="actionTagSort"
data-sort="seconds">Sequence</a></li>
<li><a class="actionTagSort" data-sort="tag">Alphabetical</a></li>
</ul>
</div>
</div>
<div class="reset"></div>
<div class="display-grid col-4 gap-row-none sortBy seconds">
<ul class="actionTagList full-width margin-none">
<li>
<a class="js-triggerJumpCat"
onclick="jumpToAction(862), ga('send', 'event', 'Video Page', 'click', 'Jump to Blowjob');">
Blowjob </a>
&nbsp;
<var>14:22</var>
</li>
<li>
<a class="js-triggerJumpCat"
onclick="jumpToAction(1117), ga('send', 'event', 'Video Page', 'click', 'Jump to Reverse Cowgirl');">
Reverse Cowgirl </a>
&nbsp;
<var>18:37</var>
</li>
</ul>
<ul class="actionTagList full-width margin-none">
<li>
<a class="js-triggerJumpCat"
onclick="jumpToAction(1182), ga('send', 'event', 'Video Page', 'click', 'Jump to Cowgirl');">
Cowgirl </a>
&nbsp;
<var>19:42</var>
</li>
<li>
<a class="js-triggerJumpCat"
onclick="jumpToAction(1625), ga('send', 'event', 'Video Page', 'click', 'Jump to Cowgirl');">
Cowgirl </a>
&nbsp;
<var>27:05</var>
</li>
</ul>
<ul class="actionTagList full-width margin-none">
<li>
<a class="js-triggerJumpCat"
onclick="jumpToAction(1822), ga('send', 'event', 'Video Page', 'click', 'Jump to Doggystyle');">
Doggystyle </a>
&nbsp;
<var>30:22</var>
</li>
</ul>
</div>
<div class="display-grid col-4 gap-row-none sortBy tag">
<ul class="actionTagList full-width margin-none">
<li>
<a class="js-triggerJumpCat"
onclick="jumpToAction(862), ga('send', 'event', 'Video Page', 'click', 'Jump to Blowjob');">
Blowjob </a>
&nbsp;
<var>14:22</var>
</li>
<li>
<a class="js-triggerJumpCat"
onclick="jumpToAction(1117), ga('send', 'event', 'Video Page', 'click', 'Jump to Reverse Cowgirl');">
Reverse Cowgirl </a>
&nbsp;
<var>18:37</var>
</li>
</ul>
<ul class="actionTagList full-width margin-none">
<li>
<a class="js-triggerJumpCat"
onclick="jumpToAction(1182), ga('send', 'event', 'Video Page', 'click', 'Jump to Cowgirl');">
Cowgirl </a>
&nbsp;
<var>19:42</var>
</li>
<li>
<a class="js-triggerJumpCat"
onclick="jumpToAction(1625), ga('send', 'event', 'Video Page', 'click', 'Jump to Cowgirl');">
Cowgirl </a>
&nbsp;
<var>27:05</var>
</li>
</ul>
<ul class="actionTagList full-width margin-none">
<li>
<a class="js-triggerJumpCat"
onclick="jumpToAction(1822), ga('send', 'event', 'Video Page', 'click', 'Jump to Doggystyle');">
Doggystyle </a>
&nbsp;
<var>30:22</var>
</li>
</ul>
</div>
</div>
</div>
@@ -637,42 +514,45 @@ const sceneHTML = `
</body>
</html>`
func makeSceneXPathConfig() xpathScraper {
common := make(commonXPathConfig)
func makeSceneXPathConfig() mappedScraper {
common := make(commonMappedConfig)
common["$performerElem"] = `//div[@class="pornstarsWrapper"]/a[@data-mxptype="Pornstar"]`
common["$studioElem"] = `//div[@data-type="channel"]/a`
config := make(xpathScraperConfig)
config := mappedSceneScraperConfig{
mappedConfig: make(mappedConfig),
}
config["Title"] = `//meta[@property="og:title"]/@content`
config.mappedConfig["Title"] = makeSimpleAttrConfig(`//meta[@property="og:title"]/@content`)
// this needs post-processing
config["Date"] = `//script[@type="application/ld+json"]`
config.mappedConfig["Date"] = makeSimpleAttrConfig(`//script[@type="application/ld+json"]`)
tagConfig := make(map[interface{}]interface{})
tagConfig["Name"] = `//div[@class="categoriesWrapper"]//a[not(@class="add-btn-small ")]`
config["Tags"] = tagConfig
tagConfig := make(mappedConfig)
tagConfig["Name"] = makeSimpleAttrConfig(`//div[@class="categoriesWrapper"]//a[not(@class="add-btn-small ")]`)
config.Tags = tagConfig
performerConfig := make(map[interface{}]interface{})
performerConfig["Name"] = `$performerElem/@data-mxptext`
performerConfig["URL"] = `$performerElem/@href`
config["Performers"] = performerConfig
performerConfig := make(mappedConfig)
performerConfig["Name"] = makeSimpleAttrConfig(`$performerElem/@data-mxptext`)
performerConfig["URL"] = makeSimpleAttrConfig(`$performerElem/@href`)
config.Performers = performerConfig
studioConfig := make(map[interface{}]interface{})
studioConfig["Name"] = `$studioElem`
studioConfig["URL"] = `$studioElem/@href`
config["Studio"] = studioConfig
studioConfig := make(mappedConfig)
studioConfig["Name"] = makeSimpleAttrConfig(`$studioElem`)
studioConfig["URL"] = makeSimpleAttrConfig(`$studioElem/@href`)
config.Studio = studioConfig
const sep = " "
moviesNameConfig := make(map[interface{}]interface{})
moviesNameConfig["selector"] = `//i[@class="isMe tooltipTrig"]/@data-title`
moviesNameConfig["split"] = sep
moviesConfig := make(map[interface{}]interface{})
moviesNameConfig := mappedScraperAttrConfig{
Selector: `//i[@class="isMe tooltipTrig"]/@data-title`,
Split: sep,
}
moviesConfig := make(mappedConfig)
moviesConfig["Name"] = moviesNameConfig
config["Movies"] = moviesConfig
config.Movies = moviesConfig
scraper := xpathScraper{
Scene: config,
scraper := mappedScraper{
Scene: &config,
Common: common,
}
@@ -764,7 +644,10 @@ func TestApplySceneXPathConfig(t *testing.T) {
scraper := makeSceneXPathConfig()
scene, err := scraper.scrapeScene(doc)
q := &xpathQuery{
doc: doc,
}
scene, err := scraper.scrapeScene(q)
if err != nil {
t.Errorf("Error scraping scene: %s", err.Error())
@@ -831,21 +714,49 @@ xPathScrapers:
performerScraper:
performer:
name: //h1[@itemprop="name"]
sceneScraper:
scene:
Title:
selector: //title
postProcess:
- parseDate: January 2, 2006
Tags:
Name: //tags
Movies:
Name: //movies
Performers:
Name: //performers
Studio:
Name: //studio
`
config := &scraperConfig{}
err := yaml.Unmarshal([]byte(yamlStr), &config)
c := &config{}
err := yaml.Unmarshal([]byte(yamlStr), &c)
if err != nil {
t.Errorf("Error loading yaml: %s", err.Error())
return
}
// ensure fields are filled in correctly
sceneScraper := c.XPathScrapers["sceneScraper"]
sceneConfig := sceneScraper.Scene
assert.Equal(t, "//title", sceneConfig.mappedConfig["Title"].Selector)
assert.Equal(t, "//tags", sceneConfig.Tags["Name"].Selector)
assert.Equal(t, "//movies", sceneConfig.Movies["Name"].Selector)
assert.Equal(t, "//performers", sceneConfig.Performers["Name"].Selector)
assert.Equal(t, "//studio", sceneConfig.Studio["Name"].Selector)
postProcess := sceneConfig.mappedConfig["Title"].postProcessActions
parseDate := postProcess[0].(*postProcessParseDate)
assert.Equal(t, "January 2, 2006", string(*parseDate))
}
func TestLoadInvalidXPath(t *testing.T) {
config := make(xpathScraperConfig)
config := make(mappedConfig)
config["Name"] = `//a[id=']/span`
config["Name"] = makeSimpleAttrConfig(`//a[id=']/span`)
reader := strings.NewReader(htmlDoc1)
doc, err := htmlquery.Parse(reader)
@@ -855,6 +766,68 @@ func TestLoadInvalidXPath(t *testing.T) {
return
}
common := make(commonXPathConfig)
config.process(doc, common)
q := &xpathQuery{
doc: doc,
}
config.process(q, nil)
}
func TestSubScrape(t *testing.T) {
retHTML := `
<div>
<a href="/getName">A link</a>
</div>
`
ssHTML := `
<span>The name</span>
`
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path == "/getName" {
fmt.Fprint(w, ssHTML)
} else {
fmt.Fprint(w, retHTML)
}
}))
defer ts.Close()
yamlStr := `name: Test
performerByURL:
- action: scrapeXPath
url:
- ` + ts.URL + `
scraper: performerScraper
xPathScrapers:
performerScraper:
performer:
Name:
selector: //div/a/@href
postProcess:
- replace:
- regex: ^
with: ` + ts.URL + `
- subScraper:
selector: //span
`
c := &config{}
err := yaml.Unmarshal([]byte(yamlStr), &c)
if err != nil {
t.Errorf("Error loading yaml: %s", err.Error())
return
}
globalConfig := GlobalConfig{}
performer, err := c.ScrapePerformerURL(ts.URL, globalConfig)
if err != nil {
t.Errorf("Error scraping performer: %s", err.Error())
return
}
verifyField(t, "The name", performer.Name, "Name")
}

View File

@@ -13,6 +13,7 @@ const markup = `
* Add support for parent/child studios.
### 🎨 Improvements
* Add mapped and fixed post-processing scraping options.
* Add random sorting for performers.
* Search for files which have low or upper case supported filename extensions.
* Add dialog when pasting movie images.

View File

@@ -209,15 +209,26 @@ performer:
This will set the `Name` attribute of the returned performer to the text content of the element that matches `<h1 itemprop="name">...`.
The value may also be a sub-object, indicating that post-processing is required. If it is a sub-object, then the xpath must be set to the `selector` key of the sub-object. For example, using the same xpath as above:
The value may also be a sub-object. If it is a sub-object, then the xpath must be set to the `selector` key of the sub-object. For example, using the same xpath as above:
```
performer:
Name:
selector: //h1[@itemprop="name"]
postProcess:
# post-processing config values
```
#### Fixed attribute values
Alternatively, an attribute value may be set to a fixed value, rather than scraping it from the webpage. This can be done by replacing `selector` with `fixed`. For example:
```
performer:
Gender:
fixed: Female
```
##### Common fragments
The `common` field is used to configure xpath fragments that can be referenced in the xpath strings. These are key-value pairs where the key is the string to reference the fragment, and the value is the string that the fragment will be replaced with. For example:
@@ -233,14 +244,44 @@ The `Measurements` xpath string will replace `$infoPiece` with `//div[@class="in
##### Post-processing options
The following post-processing keys are available:
* `concat`: if an xpath matches multiple elements, and `concat` is present, then all of the elements will be concatenated together
* `replace`: contains an array of sub-objects. Each sub-object must have a `regex` and `with` field. The `regex` field is the regex pattern to replace, and `with` is the string to replace it with. `$` is used to reference capture groups - `` is the first capture group, `` the second and so on. Replacements are performed in order of the array.
Post-processing operations are contained in the `postProcess` key. Post-processing operations are performed in the order they are specified. The following post-processing operations are available:
* `map`: contains a map of input values to output values. Where a value matches one of the input values, it is replaced with the matching output value. If no value is matched, then value is unmodified.
Example:
```
performer:
Gender:
selector: //div[class="example element"]
postProcess:
- map:
F: Female
M: Male
```
Gets the contents of the selected div element, and sets the returned value to `Female` if the scraped value is `F`; `Male` if the scraped value is `M`.
* `replace`: contains an array of sub-objects. Each sub-object must have a `regex` and `with` field. The `regex` field is the regex pattern to replace, and `with` is the string to replace it with. `$` is used to reference capture groups - `$1` is the first capture group, `$2` the second and so on. Replacements are performed in order of the array.
Example:
```
CareerLength:
selector: $infoPiece[text() = 'Career Start and End:']/../span[@class="smallInfo"]
postProcess:
- replace:
- regex: \s+to\s+
with: "-"
```
Replaces `2001 to 2003` with `2001-2003`.
* `subScraper`: if present, the sub-scraper will be executed after all other post-processes are complete and before parseDate. It then takes the value and performs an http request, using the value as the URL. Within the `subScraper` config is a nested scraping configuration. This allows you to traverse to other webpages to get the attribute value you are after. For more info and examples have a look at [#370](https://github.com/stashapp/stash/pull/370), [#606](https://github.com/stashapp/stash/pull/606)
* `parseDate`: if present, the value is the date format using go's reference date (2006-01-02). For example, if an example date was `14-Mar-2003`, then the date format would be `02-Jan-2006`. See the [time.Parse documentation](https://golang.org/pkg/time/#Parse) for details. When present, the scraper will convert the input string into a date, then convert it to the string format used by stash (`YYYY-MM-DD`).
Additionally, there are a number of fixed post-processing fields that are specified at the attribute level (not in `postProcess`) that are performed after the `postProcess` operations:
* `concat`: if an xpath matches multiple elements, and `concat` is present, then all of the elements will be concatenated together
* `split`: Its the inverse of `concat`. Splits a string to more elements using the separator given. For more info and examples have a look at PR [#579](https://github.com/stashapp/stash/pull/579)
Post-processing is done in order of the fields above - `concat`, `regex`, `subscraper`, `parseDate` and then `split`.
For backwards compatibility, `regex`, `subscraper` and `parseDate` are also allowed as keys for the attribute.
Post-processing on attribute post-process is done in the following order: `concat`, `regex`, `subscraper`, `parseDate` and then `split`.
##### Example
@@ -272,7 +313,8 @@ xPathScrapers:
Measurements: $infoPiece[text() = 'Measurements:']/../span[@class="smallInfo"]
Height:
selector: $infoPiece[text() = 'Height:']/../span[@class="smallInfo"]
replace:
postProcess:
- replace:
- regex: .*\((\d+) cm\)
with: $1
Ethnicity: $infoPiece[text() = 'Ethnicity:']/../span[@class="smallInfo"]
@@ -281,7 +323,8 @@ xPathScrapers:
Tattoos: $infoPiece[text() = 'Tattoos:']/../span[@class="smallInfo"]
CareerLength:
selector: $infoPiece[text() = 'Career Start and End:']/../span[@class="smallInfo"]
replace:
postProcess:
- replace:
- regex: \s+to\s+
with: "-"
sceneScraper: