Mirror of https://github.com/stashapp/stash.git
Refactor xpath scraper code. Add fixed and map (#616)
* Refactor xpath scraper code
* Make post-process a list
* Add map post-process action
* Add fixed xpath values
* Refactor scrapers into cache
* Refactor into mapped config
* Trim test html
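As context for the changes below, a hedged sketch of the scraper YAML this commit enables: postProcess as an ordered list of actions, the new map action, and fixed attribute values. The field names (selector, fixed, postProcess, map, replace, parseDate) come from the code in this diff; the site, XPath selectors, and values are illustrative only.

name: Example
performerByURL:
  - action: scrapeXPath
    url:
      - example.com
    scraper: performerScraper
xPathScrapers:
  performerScraper:
    performer:
      Name: //h1[@class="name"]
      Gender:
        selector: //span[@class="gender"]
        postProcess:
          # actions run in list order
          - map:
              F: Female
              M: Male
      Birthdate:
        selector: //span[@class="dob"]
        postProcess:
          - replace:
              - regex: \s+\(.*\)
                with: ""
          - parseDate: January 2, 2006
      URL:
        fixed: https://example.com/performers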
go.sum
@@ -582,6 +582,7 @@ github.com/spf13/viper v1.3.1/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DM
 github.com/spf13/viper v1.4.0 h1:yXHLWeravcrgGyFSyCgdYpXQ9dR9c/WED3pg1RhxqEU=
 github.com/spf13/viper v1.4.0/go.mod h1:PTJ7Z/lr49W6bUbkmS1V3by4uWynFiR9p7+dSq/yZzE=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/objx v0.1.1 h1:2vfRuCMp5sSVIDSqO8oNnWJq7mPa6KVP3iPIwFBuy8A=
 github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/testify v1.2.1/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
 github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
@@ -3,11 +3,11 @@ package api
 import (
 	"context"
 
-	"github.com/stashapp/stash/pkg/scraper"
+	"github.com/stashapp/stash/pkg/manager"
 )
 
 func (r *mutationResolver) ReloadScrapers(ctx context.Context) (bool, error) {
-	err := scraper.ReloadScrapers()
+	err := manager.GetInstance().ScraperCache.ReloadScrapers()
 
 	if err != nil {
 		return false, err
@@ -3,6 +3,7 @@ package api
 import (
 	"context"
 
+	"github.com/stashapp/stash/pkg/manager"
 	"github.com/stashapp/stash/pkg/models"
 	"github.com/stashapp/stash/pkg/scraper"
 )
@@ -12,12 +13,12 @@ func (r *queryResolver) ScrapeFreeones(ctx context.Context, performer_name strin
 	scrapedPerformer := models.ScrapedPerformerInput{
 		Name: &performer_name,
 	}
-	return scraper.GetFreeonesScraper().ScrapePerformer(scrapedPerformer)
+	return manager.GetInstance().ScraperCache.ScrapePerformer(scraper.FreeonesScraperID, scrapedPerformer)
 }
 
 // deprecated
 func (r *queryResolver) ScrapeFreeonesPerformerList(ctx context.Context, query string) ([]string, error) {
-	scrapedPerformers, err := scraper.GetFreeonesScraper().ScrapePerformerNames(query)
+	scrapedPerformers, err := manager.GetInstance().ScraperCache.ScrapePerformerList(scraper.FreeonesScraperID, query)
 
 	if err != nil {
 		return nil, err
@@ -33,11 +34,11 @@ func (r *queryResolver) ScrapeFreeonesPerformerList(ctx context.Context, query s
 }
 
 func (r *queryResolver) ListPerformerScrapers(ctx context.Context) ([]*models.Scraper, error) {
-	return scraper.ListPerformerScrapers()
+	return manager.GetInstance().ScraperCache.ListPerformerScrapers(), nil
 }
 
 func (r *queryResolver) ListSceneScrapers(ctx context.Context) ([]*models.Scraper, error) {
-	return scraper.ListSceneScrapers()
+	return manager.GetInstance().ScraperCache.ListSceneScrapers(), nil
 }
 
 func (r *queryResolver) ScrapePerformerList(ctx context.Context, scraperID string, query string) ([]*models.ScrapedPerformer, error) {
@@ -45,21 +46,21 @@ func (r *queryResolver) ScrapePerformerList(ctx context.Context, scraperID strin
 		return nil, nil
 	}
 
-	return scraper.ScrapePerformerList(scraperID, query)
+	return manager.GetInstance().ScraperCache.ScrapePerformerList(scraperID, query)
 }
 
 func (r *queryResolver) ScrapePerformer(ctx context.Context, scraperID string, scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
-	return scraper.ScrapePerformer(scraperID, scrapedPerformer)
+	return manager.GetInstance().ScraperCache.ScrapePerformer(scraperID, scrapedPerformer)
 }
 
 func (r *queryResolver) ScrapePerformerURL(ctx context.Context, url string) (*models.ScrapedPerformer, error) {
-	return scraper.ScrapePerformerURL(url)
+	return manager.GetInstance().ScraperCache.ScrapePerformerURL(url)
 }
 
 func (r *queryResolver) ScrapeScene(ctx context.Context, scraperID string, scene models.SceneUpdateInput) (*models.ScrapedScene, error) {
-	return scraper.ScrapeScene(scraperID, scene)
+	return manager.GetInstance().ScraperCache.ScrapeScene(scraperID, scene)
 }
 
 func (r *queryResolver) ScrapeSceneURL(ctx context.Context, url string) (*models.ScrapedScene, error) {
-	return scraper.ScrapeSceneURL(url)
+	return manager.GetInstance().ScraperCache.ScrapeSceneURL(url)
 }
@@ -10,6 +10,7 @@ import (
 	"github.com/stashapp/stash/pkg/logger"
 	"github.com/stashapp/stash/pkg/manager/config"
 	"github.com/stashapp/stash/pkg/manager/paths"
+	"github.com/stashapp/stash/pkg/scraper"
 	"github.com/stashapp/stash/pkg/utils"
 )
 
@@ -20,6 +21,8 @@ type singleton struct {
 
 	FFMPEGPath  string
 	FFProbePath string
+
+	ScraperCache *scraper.Cache
 }
 
 var instance *singleton
@@ -47,6 +50,8 @@ func Initialize() *singleton {
 		Status: TaskStatus{Status: Idle, Progress: -1},
 		Paths:  paths.NewPaths(),
 		JSON:   &jsonUtils{},
+
+		ScraperCache: initScraperCache(),
 	}
 
 	instance.RefreshConfig()
@@ -146,6 +151,20 @@ func initLog() {
 	logger.Init(config.GetLogFile(), config.GetLogOut(), config.GetLogLevel())
 }
 
+func initScraperCache() *scraper.Cache {
+	scraperConfig := scraper.GlobalConfig{
+		Path:      config.GetScrapersPath(),
+		UserAgent: config.GetScraperUserAgent(),
+	}
+	ret, err := scraper.NewCache(scraperConfig)
+
+	if err != nil {
+		logger.Errorf("Error reading scraper configs: %s", err.Error())
+	}
+
+	return ret
+}
+
 func (s *singleton) RefreshConfig() {
 	s.Paths = paths.NewPaths()
 	if config.IsValid() {
pkg/scraper/action.go (new file)
@@ -0,0 +1,53 @@
+package scraper
+
+import "github.com/stashapp/stash/pkg/models"
+
+type scraperAction string
+
+const (
+	scraperActionScript scraperAction = "script"
+	scraperActionStash  scraperAction = "stash"
+	scraperActionXPath  scraperAction = "scrapeXPath"
+)
+
+var allScraperAction = []scraperAction{
+	scraperActionScript,
+	scraperActionStash,
+	scraperActionXPath,
+}
+
+func (e scraperAction) IsValid() bool {
+	switch e {
+	case scraperActionScript, scraperActionStash, scraperActionXPath:
+		return true
+	}
+	return false
+}
+
+type scrapeOptions struct {
+	scraper      scraperTypeConfig
+	config       config
+	globalConfig GlobalConfig
+}
+
+type scraper interface {
+	scrapePerformersByName(name string) ([]*models.ScrapedPerformer, error)
+	scrapePerformerByFragment(scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error)
+	scrapePerformerByURL(url string) (*models.ScrapedPerformer, error)
+
+	scrapeSceneByFragment(scene models.SceneUpdateInput) (*models.ScrapedScene, error)
+	scrapeSceneByURL(url string) (*models.ScrapedScene, error)
+}
+
+func getScraper(scraper scraperTypeConfig, config config, globalConfig GlobalConfig) scraper {
+	switch scraper.Action {
+	case scraperActionScript:
+		return newScriptScraper(scraper, config, globalConfig)
+	case scraperActionStash:
+		return newStashScraper(scraper, config, globalConfig)
+	case scraperActionXPath:
+		return newXpathScraper(scraper, config, globalConfig)
+	}
+
+	panic("unknown scraper action: " + scraper.Action)
+}
@@ -1,6 +1,8 @@
 package scraper
 
 import (
+	"errors"
+	"fmt"
 	"io"
 	"os"
 	"path/filepath"
@@ -11,32 +13,80 @@ import (
 	"github.com/stashapp/stash/pkg/models"
 )
 
+type config struct {
+	ID   string
+	path string
+
+	// The name of the scraper. This is displayed in the UI.
+	Name string `yaml:"name"`
+
+	// Configuration for querying performers by name
+	PerformerByName *scraperTypeConfig `yaml:"performerByName"`
+
+	// Configuration for querying performers by a Performer fragment
+	PerformerByFragment *scraperTypeConfig `yaml:"performerByFragment"`
+
+	// Configuration for querying a performer by a URL
+	PerformerByURL []*scrapeByURLConfig `yaml:"performerByURL"`
+
+	// Configuration for querying scenes by a Scene fragment
+	SceneByFragment *scraperTypeConfig `yaml:"sceneByFragment"`
+
+	// Configuration for querying a scene by a URL
+	SceneByURL []*scrapeByURLConfig `yaml:"sceneByURL"`
+
+	// Scraper debugging options
+	DebugOptions *scraperDebugOptions `yaml:"debug"`
+
+	// Stash server configuration
+	StashServer *stashServer `yaml:"stashServer"`
+
+	// Xpath scraping configurations
+	XPathScrapers mappedScrapers `yaml:"xPathScrapers"`
+}
+
+func (c config) validate() error {
+	if strings.TrimSpace(c.Name) == "" {
+		return errors.New("name must not be empty")
+	}
+
+	if c.PerformerByName != nil {
+		if err := c.PerformerByName.validate(); err != nil {
+			return err
+		}
+	}
+
+	if c.PerformerByFragment != nil {
+		if err := c.PerformerByFragment.validate(); err != nil {
+			return err
+		}
+	}
+
+	if c.SceneByFragment != nil {
+		if err := c.SceneByFragment.validate(); err != nil {
+			return err
+		}
+	}
+
+	for _, s := range c.PerformerByURL {
+		if err := s.validate(); err != nil {
+			return err
+		}
+	}
+
+	for _, s := range c.SceneByURL {
+		if err := s.validate(); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
 type stashServer struct {
 	URL string `yaml:"url"`
 }
 
-type scraperAction string
-
-const (
-	scraperActionScript scraperAction = "script"
-	scraperActionStash  scraperAction = "stash"
-	scraperActionXPath  scraperAction = "scrapeXPath"
-)
-
-var allScraperAction = []scraperAction{
-	scraperActionScript,
-	scraperActionStash,
-	scraperActionXPath,
-}
-
-func (e scraperAction) IsValid() bool {
-	switch e {
-	case scraperActionScript, scraperActionStash, scraperActionXPath:
-		return true
-	}
-	return false
-}
-
 type scraperTypeConfig struct {
 	Action scraperAction `yaml:"action"`
 	Script []string      `yaml:"script,flow"`
@@ -44,40 +94,18 @@ type scraperTypeConfig struct {
 
 	// for xpath name scraper only
 	QueryURL string `yaml:"queryURL"`
 
-	scraperConfig *scraperConfig
 }
 
-type scrapePerformerNamesFunc func(c scraperTypeConfig, name string) ([]*models.ScrapedPerformer, error)
-
-type performerByNameConfig struct {
-	scraperTypeConfig `yaml:",inline"`
-	performScrape     scrapePerformerNamesFunc
-}
-
-func (c *performerByNameConfig) resolveFn() {
-	if c.Action == scraperActionScript {
-		c.performScrape = scrapePerformerNamesScript
-	} else if c.Action == scraperActionStash {
-		c.performScrape = scrapePerformerNamesStash
-	} else if c.Action == scraperActionXPath {
-		c.performScrape = scrapePerformerNamesXPath
-	}
-}
-
-type scrapePerformerFragmentFunc func(c scraperTypeConfig, scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error)
-
-type performerByFragmentConfig struct {
-	scraperTypeConfig `yaml:",inline"`
-	performScrape     scrapePerformerFragmentFunc
-}
-
-func (c *performerByFragmentConfig) resolveFn() {
-	if c.Action == scraperActionScript {
-		c.performScrape = scrapePerformerFragmentScript
-	} else if c.Action == scraperActionStash {
-		c.performScrape = scrapePerformerFragmentStash
-	}
-}
+func (c scraperTypeConfig) validate() error {
+	if !c.Action.IsValid() {
+		return fmt.Errorf("%s is not a valid scraper action", c.Action)
+	}
+
+	if c.Action == scraperActionScript && len(c.Script) == 0 {
+		return errors.New("script is mandatory for script scraper action")
+	}
+
+	return nil
+}
 
 type scrapeByURLConfig struct {
@@ -85,6 +113,14 @@ type scrapeByURLConfig struct {
 	URL []string `yaml:"url,flow"`
 }
 
+func (c scrapeByURLConfig) validate() error {
+	if len(c.URL) == 0 {
+		return errors.New("url is mandatory for scrape by url scrapers")
+	}
+
+	return c.scraperTypeConfig.validate()
+}
+
 func (c scrapeByURLConfig) matchesURL(url string) bool {
 	for _, thisURL := range c.URL {
 		if strings.Contains(url, thisURL) {
@@ -95,71 +131,12 @@ func (c scrapeByURLConfig) matchesURL(url string) bool {
 	return false
 }
 
-type scrapePerformerByURLFunc func(c scraperTypeConfig, url string) (*models.ScrapedPerformer, error)
-
-type scrapePerformerByURLConfig struct {
-	scrapeByURLConfig `yaml:",inline"`
-	performScrape     scrapePerformerByURLFunc
-}
-
-func (c *scrapePerformerByURLConfig) resolveFn() {
-	if c.Action == scraperActionScript {
-		c.performScrape = scrapePerformerURLScript
-	} else if c.Action == scraperActionXPath {
-		c.performScrape = scrapePerformerURLXpath
-	}
-}
-
-type scrapeSceneFragmentFunc func(c scraperTypeConfig, scene models.SceneUpdateInput) (*models.ScrapedScene, error)
-
-type sceneByFragmentConfig struct {
-	scraperTypeConfig `yaml:",inline"`
-	performScrape     scrapeSceneFragmentFunc
-}
-
-func (c *sceneByFragmentConfig) resolveFn() {
-	if c.Action == scraperActionScript {
-		c.performScrape = scrapeSceneFragmentScript
-	} else if c.Action == scraperActionStash {
-		c.performScrape = scrapeSceneFragmentStash
-	}
-}
-
-type scrapeSceneByURLFunc func(c scraperTypeConfig, url string) (*models.ScrapedScene, error)
-
-type scrapeSceneByURLConfig struct {
-	scrapeByURLConfig `yaml:",inline"`
-	performScrape     scrapeSceneByURLFunc
-}
-
-func (c *scrapeSceneByURLConfig) resolveFn() {
-	if c.Action == scraperActionScript {
-		c.performScrape = scrapeSceneURLScript
-	} else if c.Action == scraperActionXPath {
-		c.performScrape = scrapeSceneURLXPath
-	}
-}
-
 type scraperDebugOptions struct {
 	PrintHTML bool `yaml:"printHTML"`
 }
 
-type scraperConfig struct {
-	ID                  string
-	Name                string                        `yaml:"name"`
-	PerformerByName     *performerByNameConfig        `yaml:"performerByName"`
-	PerformerByFragment *performerByFragmentConfig    `yaml:"performerByFragment"`
-	PerformerByURL      []*scrapePerformerByURLConfig `yaml:"performerByURL"`
-	SceneByFragment     *sceneByFragmentConfig        `yaml:"sceneByFragment"`
-	SceneByURL          []*scrapeSceneByURLConfig     `yaml:"sceneByURL"`
-
-	DebugOptions  *scraperDebugOptions `yaml:"debug"`
-	StashServer   *stashServer         `yaml:"stashServer"`
-	XPathScrapers xpathScrapers        `yaml:"xPathScrapers"`
-}
-
-func loadScraperFromYAML(id string, reader io.Reader) (*scraperConfig, error) {
-	ret := &scraperConfig{}
-
+func loadScraperFromYAML(id string, reader io.Reader) (*config, error) {
+	ret := &config{}
 
 	parser := yaml.NewDecoder(reader)
 	parser.SetStrict(true)
@@ -170,13 +147,14 @@ func loadScraperFromYAML(id string, reader io.Reader) (*scraperConfig, error) {
 
 	ret.ID = id
 
-	// set the scraper interface
-	ret.initialiseConfigs()
+	if err := ret.validate(); err != nil {
+		return nil, err
+	}
 
 	return ret, nil
 }
 
-func loadScraperFromYAMLFile(path string) (*scraperConfig, error) {
+func loadScraperFromYAMLFile(path string) (*config, error) {
 	file, err := os.Open(path)
 	defer file.Close()
 	if err != nil {
@@ -187,34 +165,17 @@ func loadScraperFromYAMLFile(path string) (*scraperConfig, error) {
 	id := filepath.Base(path)
 	id = id[:strings.LastIndex(id, ".")]
 
-	return loadScraperFromYAML(id, file)
-}
-
-func (c *scraperConfig) initialiseConfigs() {
-	if c.PerformerByName != nil {
-		c.PerformerByName.resolveFn()
-		c.PerformerByName.scraperConfig = c
-	}
-	if c.PerformerByFragment != nil {
-		c.PerformerByFragment.resolveFn()
-		c.PerformerByFragment.scraperConfig = c
-	}
-	for _, s := range c.PerformerByURL {
-		s.resolveFn()
-		s.scraperConfig = c
-	}
-
-	if c.SceneByFragment != nil {
-		c.SceneByFragment.resolveFn()
-		c.SceneByFragment.scraperConfig = c
-	}
-	for _, s := range c.SceneByURL {
-		s.resolveFn()
-		s.scraperConfig = c
-	}
-}
-
-func (c scraperConfig) toScraper() *models.Scraper {
+	ret, err := loadScraperFromYAML(id, file)
+	if err != nil {
+		return nil, err
+	}
+
+	ret.path = path
+
+	return ret, nil
+}
+
+func (c config) toScraper() *models.Scraper {
 	ret := models.Scraper{
 		ID:   c.ID,
 		Name: c.Name,
@@ -256,11 +217,11 @@ func (c scraperConfig) toScraper() *models.Scraper {
 	return &ret
 }
 
-func (c scraperConfig) supportsPerformers() bool {
+func (c config) supportsPerformers() bool {
 	return c.PerformerByName != nil || c.PerformerByFragment != nil || len(c.PerformerByURL) > 0
 }
 
-func (c scraperConfig) matchesPerformerURL(url string) bool {
+func (c config) matchesPerformerURL(url string) bool {
 	for _, scraper := range c.PerformerByURL {
 		if scraper.matchesURL(url) {
 			return true
@@ -270,31 +231,34 @@ func (c scraperConfig) matchesPerformerURL(url string) bool {
 	return false
 }
 
-func (c scraperConfig) ScrapePerformerNames(name string) ([]*models.ScrapedPerformer, error) {
-	if c.PerformerByName != nil && c.PerformerByName.performScrape != nil {
-		return c.PerformerByName.performScrape(c.PerformerByName.scraperTypeConfig, name)
+func (c config) ScrapePerformerNames(name string, globalConfig GlobalConfig) ([]*models.ScrapedPerformer, error) {
+	if c.PerformerByName != nil {
+		s := getScraper(*c.PerformerByName, c, globalConfig)
+		return s.scrapePerformersByName(name)
 	}
 
 	return nil, nil
 }
 
-func (c scraperConfig) ScrapePerformer(scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
-	if c.PerformerByFragment != nil && c.PerformerByFragment.performScrape != nil {
-		return c.PerformerByFragment.performScrape(c.PerformerByFragment.scraperTypeConfig, scrapedPerformer)
+func (c config) ScrapePerformer(scrapedPerformer models.ScrapedPerformerInput, globalConfig GlobalConfig) (*models.ScrapedPerformer, error) {
+	if c.PerformerByFragment != nil {
+		s := getScraper(*c.PerformerByFragment, c, globalConfig)
+		return s.scrapePerformerByFragment(scrapedPerformer)
 	}
 
 	// try to match against URL if present
 	if scrapedPerformer.URL != nil && *scrapedPerformer.URL != "" {
-		return c.ScrapePerformerURL(*scrapedPerformer.URL)
+		return c.ScrapePerformerURL(*scrapedPerformer.URL, globalConfig)
 	}
 
 	return nil, nil
 }
 
-func (c scraperConfig) ScrapePerformerURL(url string) (*models.ScrapedPerformer, error) {
+func (c config) ScrapePerformerURL(url string, globalConfig GlobalConfig) (*models.ScrapedPerformer, error) {
 	for _, scraper := range c.PerformerByURL {
-		if scraper.matchesURL(url) && scraper.performScrape != nil {
-			ret, err := scraper.performScrape(scraper.scraperTypeConfig, url)
+		if scraper.matchesURL(url) {
+			s := getScraper(scraper.scraperTypeConfig, c, globalConfig)
+			ret, err := s.scrapePerformerByURL(url)
 			if err != nil {
 				return nil, err
 			}
@@ -308,11 +272,11 @@ func (c scraperConfig) ScrapePerformerURL(url string) (*models.ScrapedPerformer,
 	return nil, nil
 }
 
-func (c scraperConfig) supportsScenes() bool {
+func (c config) supportsScenes() bool {
 	return c.SceneByFragment != nil || len(c.SceneByURL) > 0
 }
 
-func (c scraperConfig) matchesSceneURL(url string) bool {
+func (c config) matchesSceneURL(url string) bool {
 	for _, scraper := range c.SceneByURL {
 		if scraper.matchesURL(url) {
 			return true
@@ -322,18 +286,20 @@ func (c scraperConfig) matchesSceneURL(url string) bool {
 	return false
 }
 
-func (c scraperConfig) ScrapeScene(scene models.SceneUpdateInput) (*models.ScrapedScene, error) {
-	if c.SceneByFragment != nil && c.SceneByFragment.performScrape != nil {
-		return c.SceneByFragment.performScrape(c.SceneByFragment.scraperTypeConfig, scene)
+func (c config) ScrapeScene(scene models.SceneUpdateInput, globalConfig GlobalConfig) (*models.ScrapedScene, error) {
+	if c.SceneByFragment != nil {
+		s := getScraper(*c.SceneByFragment, c, globalConfig)
+		return s.scrapeSceneByFragment(scene)
 	}
 
 	return nil, nil
 }
 
-func (c scraperConfig) ScrapeSceneURL(url string) (*models.ScrapedScene, error) {
+func (c config) ScrapeSceneURL(url string, globalConfig GlobalConfig) (*models.ScrapedScene, error) {
 	for _, scraper := range c.SceneByURL {
-		if scraper.matchesURL(url) && scraper.performScrape != nil {
-			ret, err := scraper.performScrape(scraper.scraperTypeConfig, url)
+		if scraper.matchesURL(url) {
+			s := getScraper(scraper.scraperTypeConfig, c, globalConfig)
+			ret, err := s.scrapeSceneByURL(url)
 			if err != nil {
 				return nil, err
 			}
@@ -6,7 +6,8 @@ import (
 	"github.com/stashapp/stash/pkg/logger"
 )
 
-const freeonesScraperID = "builtin_freeones"
+// FreeonesScraperID is the scraper ID for the built-in Freeones scraper
+const FreeonesScraperID = "builtin_freeones"
 
 // 537: stolen from: https://github.com/stashapp/CommunityScrapers/blob/master/scrapers/FreeonesCommunity.yml
 const freeonesScraperConfig = `
@@ -103,10 +104,10 @@ xPathScrapers:
 # Last updated June 15, 2020
 `
 
-func GetFreeonesScraper() scraperConfig {
+func getFreeonesScraper() config {
 	yml := freeonesScraperConfig
 
-	scraper, err := loadScraperFromYAML(freeonesScraperID, strings.NewReader(yml))
+	scraper, err := loadScraperFromYAML(FreeonesScraperID, strings.NewReader(yml))
 	if err != nil {
 		logger.Fatalf("Error loading builtin freeones scraper: %s", err.Error())
 	}
@@ -6,7 +6,6 @@ import (
 	"strings"
 	"time"
 
-	"github.com/stashapp/stash/pkg/manager/config"
 	"github.com/stashapp/stash/pkg/models"
 	"github.com/stashapp/stash/pkg/utils"
 )
@@ -15,13 +14,13 @@ import (
 // configurable at some point.
 const imageGetTimeout = time.Second * 30
 
-func setPerformerImage(p *models.ScrapedPerformer) error {
+func setPerformerImage(p *models.ScrapedPerformer, globalConfig GlobalConfig) error {
 	if p == nil || p.Image == nil || !strings.HasPrefix(*p.Image, "http") {
 		// nothing to do
 		return nil
 	}
 
-	img, err := getImage(*p.Image)
+	img, err := getImage(*p.Image, globalConfig)
 	if err != nil {
 		return err
 	}
@@ -31,14 +30,14 @@ func setPerformerImage(p *models.ScrapedPerformer) error {
 	return nil
 }
 
-func setSceneImage(s *models.ScrapedScene) error {
+func setSceneImage(s *models.ScrapedScene, globalConfig GlobalConfig) error {
 	// don't try to get the image if it doesn't appear to be a URL
 	if s == nil || s.Image == nil || !strings.HasPrefix(*s.Image, "http") {
 		// nothing to do
 		return nil
 	}
 
-	img, err := getImage(*s.Image)
+	img, err := getImage(*s.Image, globalConfig)
 	if err != nil {
 		return err
 	}
@@ -48,7 +47,7 @@ func setSceneImage(s *models.ScrapedScene) error {
 	return nil
 }
 
-func getImage(url string) (*string, error) {
+func getImage(url string, globalConfig GlobalConfig) (*string, error) {
 	client := &http.Client{
 		Timeout: imageGetTimeout,
 	}
@@ -58,7 +57,7 @@ func getImage(url string) (*string, error) {
 		return nil, err
 	}
 
-	userAgent := config.GetScraperUserAgent()
+	userAgent := globalConfig.UserAgent
 	if userAgent != "" {
 		req.Header.Set("User-Agent", userAgent)
 	}
@@ -93,10 +92,10 @@ func getImage(url string) (*string, error) {
 	return &img, nil
 }
 
-func getStashPerformerImage(stashURL string, performerID string) (*string, error) {
-	return getImage(stashURL + "/performer/" + performerID + "/image")
+func getStashPerformerImage(stashURL string, performerID string, globalConfig GlobalConfig) (*string, error) {
+	return getImage(stashURL+"/performer/"+performerID+"/image", globalConfig)
 }
 
-func getStashSceneImage(stashURL string, sceneID string) (*string, error) {
-	return getImage(stashURL + "/scene/" + sceneID + "/screenshot")
+func getStashSceneImage(stashURL string, sceneID string, globalConfig GlobalConfig) (*string, error) {
+	return getImage(stashURL+"/scene/"+sceneID+"/screenshot", globalConfig)
 }
pkg/scraper/mapped.go (new file)
@@ -0,0 +1,600 @@
+package scraper
+
+import (
+	"errors"
+	"fmt"
+	"reflect"
+	"regexp"
+	"strings"
+	"time"
+
+	"github.com/stashapp/stash/pkg/logger"
+	"github.com/stashapp/stash/pkg/models"
+	"gopkg.in/yaml.v2"
+)
+
+type mappedQuery interface {
+	runQuery(selector string) []string
+	subScrape(value string) mappedQuery
+}
+
+type commonMappedConfig map[string]string
+
+type mappedConfig map[string]mappedScraperAttrConfig
+
+func (s mappedConfig) applyCommon(c commonMappedConfig, src string) string {
+	if c == nil {
+		return src
+	}
+
+	ret := src
+	for commonKey, commonVal := range c {
+		if strings.Contains(ret, commonKey) {
+			ret = strings.Replace(ret, commonKey, commonVal, -1)
+		}
+	}
+
+	return ret
+}
+
+func (s mappedConfig) process(q mappedQuery, common commonMappedConfig) mappedResults {
+	var ret mappedResults
+
+	for k, attrConfig := range s {
+
+		if attrConfig.Fixed != "" {
+			// TODO - not sure if this needs to set _all_ indexes for the key
+			const i = 0
+			ret = ret.setKey(i, k, attrConfig.Fixed)
+		} else {
+			selector := attrConfig.Selector
+			selector = s.applyCommon(common, selector)
+
+			found := q.runQuery(selector)
+
+			if len(found) > 0 {
+				result := s.postProcess(q, attrConfig, found)
+				for i, text := range result {
+					ret = ret.setKey(i, k, text)
+				}
+			}
+		}
+	}
+
+	return ret
+}
+
+func (s mappedConfig) postProcess(q mappedQuery, attrConfig mappedScraperAttrConfig, found []string) []string {
+	// check if we're concatenating the results into a single result
+	var ret []string
+	if attrConfig.hasConcat() {
+		result := attrConfig.concatenateResults(found)
+		result = attrConfig.postProcess(result, q)
+		if attrConfig.hasSplit() {
+			return attrConfig.splitString(result)
+		}
+
+		ret = []string{result}
+	} else {
+		for _, text := range found {
+			text = attrConfig.postProcess(text, q)
+			if attrConfig.hasSplit() {
+				return attrConfig.splitString(text)
+			}
+
+			ret = append(ret, text)
+		}
+	}
+
+	return ret
+}
+
+type mappedSceneScraperConfig struct {
+	mappedConfig
+
+	Tags       mappedConfig `yaml:"Tags"`
+	Performers mappedConfig `yaml:"Performers"`
+	Studio     mappedConfig `yaml:"Studio"`
+	Movies     mappedConfig `yaml:"Movies"`
+}
+type _mappedSceneScraperConfig mappedSceneScraperConfig
+
+const (
+	mappedScraperConfigSceneTags       = "Tags"
+	mappedScraperConfigScenePerformers = "Performers"
+	mappedScraperConfigSceneStudio     = "Studio"
+	mappedScraperConfigSceneMovies     = "Movies"
+)
+
+func (s *mappedSceneScraperConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
+	// HACK - unmarshal to map first, then remove known scene sub-fields, then
+	// remarshal to yaml and pass that down to the base map
+	parentMap := make(map[string]interface{})
+	if err := unmarshal(parentMap); err != nil {
+		return err
+	}
+
+	// move the known sub-fields to a separate map
+	thisMap := make(map[string]interface{})
+
+	thisMap[mappedScraperConfigSceneTags] = parentMap[mappedScraperConfigSceneTags]
+	thisMap[mappedScraperConfigScenePerformers] = parentMap[mappedScraperConfigScenePerformers]
+	thisMap[mappedScraperConfigSceneStudio] = parentMap[mappedScraperConfigSceneStudio]
+	thisMap[mappedScraperConfigSceneMovies] = parentMap[mappedScraperConfigSceneMovies]
+
+	delete(parentMap, mappedScraperConfigSceneTags)
+	delete(parentMap, mappedScraperConfigScenePerformers)
+	delete(parentMap, mappedScraperConfigSceneStudio)
+	delete(parentMap, mappedScraperConfigSceneMovies)
+
+	// re-unmarshal the sub-fields
+	yml, err := yaml.Marshal(thisMap)
+	if err != nil {
+		return err
+	}
+
+	// needs to be a different type to prevent infinite recursion
+	c := _mappedSceneScraperConfig{}
+	if err := yaml.Unmarshal(yml, &c); err != nil {
+		return err
+	}
+
+	*s = mappedSceneScraperConfig(c)
+
+	yml, err = yaml.Marshal(parentMap)
+	if err != nil {
+		return err
+	}
+
+	if err := yaml.Unmarshal(yml, &s.mappedConfig); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+type mappedPerformerScraperConfig struct {
+	mappedConfig
+}
+
+func (s *mappedPerformerScraperConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
+	return unmarshal(&s.mappedConfig)
+}
+
+type mappedRegexConfig struct {
+	Regex string `yaml:"regex"`
+	With  string `yaml:"with"`
+}
+
+type mappedRegexConfigs []mappedRegexConfig
+
+func (c mappedRegexConfig) apply(value string) string {
+	if c.Regex != "" {
+		re, err := regexp.Compile(c.Regex)
+		if err != nil {
+			logger.Warnf("Error compiling regex '%s': %s", c.Regex, err.Error())
+			return value
+		}
+
+		ret := re.ReplaceAllString(value, c.With)
+
+		// trim leading and trailing whitespace
+		// this is done to maintain backwards compatibility with existing
+		// scrapers
+		ret = strings.TrimSpace(ret)
+
+		logger.Debugf(`Replace: '%s' with '%s'`, c.Regex, c.With)
+		logger.Debugf("Before: %s", value)
+		logger.Debugf("After: %s", ret)
+		return ret
+	}
+
+	return value
+}
+
+func (c mappedRegexConfigs) apply(value string) string {
+	// apply regex in order
+	for _, config := range c {
+		value = config.apply(value)
+	}
+
+	return value
+}
+
+type postProcessAction interface {
+	Apply(value string, q mappedQuery) string
+}
+
+type postProcessParseDate string
+
+func (p *postProcessParseDate) Apply(value string, q mappedQuery) string {
+	parseDate := string(*p)
+
+	if parseDate == "" {
+		return value
+	}
+
+	// try to parse the date using the pattern
+	// if it fails, then just fall back to the original value
+	parsedValue, err := time.Parse(parseDate, value)
+	if err != nil {
+		logger.Warnf("Error parsing date string '%s' using format '%s': %s", value, parseDate, err.Error())
+		return value
+	}
+
+	// convert it into our date format
+	const internalDateFormat = "2006-01-02"
+	return parsedValue.Format(internalDateFormat)
+}
+
+type postProcessReplace mappedRegexConfigs
+
+func (c *postProcessReplace) Apply(value string, q mappedQuery) string {
+	replace := mappedRegexConfigs(*c)
+	return replace.apply(value)
+}
+
+type postProcessSubScraper mappedScraperAttrConfig
+
+func (p *postProcessSubScraper) Apply(value string, q mappedQuery) string {
+	subScrapeConfig := mappedScraperAttrConfig(*p)
+
+	logger.Debugf("Sub-scraping for: %s", value)
+	ss := q.subScrape(value)
+
+	if ss != nil {
+		found := ss.runQuery(subScrapeConfig.Selector)
+
+		if len(found) > 0 {
+			// check if we're concatenating the results into a single result
+			var result string
+			if subScrapeConfig.hasConcat() {
+				result = subScrapeConfig.concatenateResults(found)
+			} else {
+				result = found[0]
+			}
+
+			result = subScrapeConfig.postProcess(result, ss)
+			return result
+		}
+	}
+
+	return ""
+}
+
+type postProcessMap map[string]string
+
+func (p *postProcessMap) Apply(value string, q mappedQuery) string {
+	// return the mapped value if present
+	m := *p
+	mapped, ok := m[value]
+
+	if ok {
+		return mapped
+	}
+
+	return value
+}
+
+type mappedPostProcessAction struct {
+	ParseDate  string                   `yaml:"parseDate"`
+	Replace    mappedRegexConfigs       `yaml:"replace"`
+	SubScraper *mappedScraperAttrConfig `yaml:"subScraper"`
+	Map        map[string]string        `yaml:"map"`
+}
+
+func (a mappedPostProcessAction) ToPostProcessAction() (postProcessAction, error) {
+	var found string
+	var ret postProcessAction
+
+	if a.ParseDate != "" {
+		found = "parseDate"
+		action := postProcessParseDate(a.ParseDate)
+		ret = &action
+	}
+	if len(a.Replace) > 0 {
+		if found != "" {
+			return nil, fmt.Errorf("post-process actions must have a single field, found %s and %s", found, "replace")
+		}
+		found = "replace"
+		action := postProcessReplace(a.Replace)
+		ret = &action
+	}
+	if a.SubScraper != nil {
+		if found != "" {
+			return nil, fmt.Errorf("post-process actions must have a single field, found %s and %s", found, "subScraper")
+		}
+		found = "subScraper"
+		action := postProcessSubScraper(*a.SubScraper)
+		ret = &action
+	}
+	if a.Map != nil {
+		if found != "" {
+			return nil, fmt.Errorf("post-process actions must have a single field, found %s and %s", found, "map")
+		}
+		found = "map"
+		action := postProcessMap(a.Map)
+		ret = &action
+	}
+
+	if ret == nil {
+		return nil, errors.New("invalid post-process action")
+	}
+
+	return ret, nil
+}
+
+type mappedScraperAttrConfig struct {
+	Selector    string                    `yaml:"selector"`
+	Fixed       string                    `yaml:"fixed"`
+	PostProcess []mappedPostProcessAction `yaml:"postProcess"`
+	Concat      string                    `yaml:"concat"`
+	Split       string                    `yaml:"split"`
+
+	postProcessActions []postProcessAction
+
+	// deprecated: use PostProcess instead
+	ParseDate  string                   `yaml:"parseDate"`
+	Replace    mappedRegexConfigs       `yaml:"replace"`
+	SubScraper *mappedScraperAttrConfig `yaml:"subScraper"`
+}
+
+type _mappedScraperAttrConfig mappedScraperAttrConfig
+
+func (c *mappedScraperAttrConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
+	// try unmarshalling into a string first
+	if err := unmarshal(&c.Selector); err != nil {
+		// if it's a type error then we try to unmarshall to the full object
+		if _, ok := err.(*yaml.TypeError); !ok {
+			return err
+		}
+
+		// unmarshall to full object
+		// need it as a separate object
+		t := _mappedScraperAttrConfig{}
+		if err = unmarshal(&t); err != nil {
+			return err
+		}
+
+		*c = mappedScraperAttrConfig(t)
+	}
+
+	return c.convertPostProcessActions()
+}
+
+func (c *mappedScraperAttrConfig) convertPostProcessActions() error {
+	// ensure we don't have the old deprecated fields and the new post process field
+	if len(c.PostProcess) > 0 {
+		if c.ParseDate != "" || len(c.Replace) > 0 || c.SubScraper != nil {
+			return errors.New("cannot include postProcess and (parseDate, replace, subScraper) deprecated fields")
+		}
+
+		// convert xpathPostProcessAction actions to postProcessActions
+		for _, a := range c.PostProcess {
+			action, err := a.ToPostProcessAction()
+			if err != nil {
+				return err
+			}
+			c.postProcessActions = append(c.postProcessActions, action)
+		}
+
+		c.PostProcess = nil
+	} else {
+		// convert old deprecated fields if present
+		// in same order as they used to be executed
+		if len(c.Replace) > 0 {
+			action := postProcessReplace(c.Replace)
+			c.postProcessActions = append(c.postProcessActions, &action)
+			c.Replace = nil
+		}
+
+		if c.SubScraper != nil {
+			action := postProcessSubScraper(*c.SubScraper)
+			c.postProcessActions = append(c.postProcessActions, &action)
+			c.SubScraper = nil
+		}
+
+		if c.ParseDate != "" {
+			action := postProcessParseDate(c.ParseDate)
+			c.postProcessActions = append(c.postProcessActions, &action)
+			c.ParseDate = ""
+		}
+	}
+
+	return nil
+}
+
+func (c mappedScraperAttrConfig) hasConcat() bool {
+	return c.Concat != ""
+}
+
+func (c mappedScraperAttrConfig) hasSplit() bool {
+	return c.Split != ""
+}
+
+func (c mappedScraperAttrConfig) concatenateResults(nodes []string) string {
+	separator := c.Concat
+	result := []string{}
+
+	for _, text := range nodes {
+		result = append(result, text)
+	}
+
+	return strings.Join(result, separator)
+}
+
+func (c mappedScraperAttrConfig) splitString(value string) []string {
+	separator := c.Split
+	var res []string
+
+	if separator == "" {
+		return []string{value}
+	}
+
+	for _, str := range strings.Split(value, separator) {
+		if str != "" {
+			res = append(res, str)
+		}
+	}
+
+	return res
+}
+
+func (c mappedScraperAttrConfig) postProcess(value string, q mappedQuery) string {
+	for _, action := range c.postProcessActions {
+		value = action.Apply(value, q)
+	}
+
+	return value
+}
+
+type mappedScrapers map[string]*mappedScraper
+
+type mappedScraper struct {
+	Common    commonMappedConfig            `yaml:"common"`
+	Scene     *mappedSceneScraperConfig     `yaml:"scene"`
+	Performer *mappedPerformerScraperConfig `yaml:"performer"`
+}
+
+type mappedResult map[string]string
+type mappedResults []mappedResult
+
+func (r mappedResult) apply(dest interface{}) {
+	destVal := reflect.ValueOf(dest)
+
+	// dest should be a pointer
+	destVal = destVal.Elem()
+
+	for key, value := range r {
+		field := destVal.FieldByName(key)
+
+		if field.IsValid() {
+			var reflectValue reflect.Value
+			if field.Kind() == reflect.Ptr {
+				// need to copy the value, otherwise everything is set to the
+				// same pointer
+				localValue := value
+				reflectValue = reflect.ValueOf(&localValue)
+			} else {
+				reflectValue = reflect.ValueOf(value)
+			}
+
+			field.Set(reflectValue)
+		} else {
+			logger.Errorf("Field %s does not exist in %T", key, dest)
+		}
+	}
+}
+
+func (r mappedResults) setKey(index int, key string, value string) mappedResults {
+	if index >= len(r) {
+		r = append(r, make(mappedResult))
+	}
+
+	logger.Debugf(`[%d][%s] = %s`, index, key, value)
+	r[index][key] = value
+	return r
+}
+
+func (s mappedScraper) scrapePerformer(q mappedQuery) (*models.ScrapedPerformer, error) {
+	var ret models.ScrapedPerformer
+
+	performerMap := s.Performer
+	if performerMap == nil {
+		return nil, nil
+	}
+
+	results := performerMap.process(q, s.Common)
+	if len(results) > 0 {
+		results[0].apply(&ret)
+	}
+
+	return &ret, nil
+}
+
+func (s mappedScraper) scrapePerformers(q mappedQuery) ([]*models.ScrapedPerformer, error) {
+	var ret []*models.ScrapedPerformer
+
+	performerMap := s.Performer
+	if performerMap == nil {
+		return nil, nil
+	}
+
+	results := performerMap.process(q, s.Common)
+	for _, r := range results {
+		var p models.ScrapedPerformer
+		r.apply(&p)
+		ret = append(ret, &p)
+	}
+
+	return ret, nil
+}
+
+func (s mappedScraper) scrapeScene(q mappedQuery) (*models.ScrapedScene, error) {
+	var ret models.ScrapedScene
+
+	sceneScraperConfig := s.Scene
+	sceneMap := sceneScraperConfig.mappedConfig
+	if sceneMap == nil {
+		return nil, nil
+	}
+
+	scenePerformersMap := sceneScraperConfig.Performers
+	sceneTagsMap := sceneScraperConfig.Tags
+	sceneStudioMap := sceneScraperConfig.Studio
+	sceneMoviesMap := sceneScraperConfig.Movies
+
+	logger.Debug(`Processing scene:`)
+	results := sceneMap.process(q, s.Common)
+	if len(results) > 0 {
+		results[0].apply(&ret)
+
+		// now apply the performers and tags
+		if scenePerformersMap != nil {
+			logger.Debug(`Processing scene performers:`)
+			performerResults := scenePerformersMap.process(q, s.Common)
+
+			for _, p := range performerResults {
+				performer := &models.ScrapedScenePerformer{}
+				p.apply(performer)
+				ret.Performers = append(ret.Performers, performer)
+			}
+		}
+
+		if sceneTagsMap != nil {
+			logger.Debug(`Processing scene tags:`)
+			tagResults := sceneTagsMap.process(q, s.Common)
+
+			for _, p := range tagResults {
+				tag := &models.ScrapedSceneTag{}
+				p.apply(tag)
+				ret.Tags = append(ret.Tags, tag)
+			}
+		}
+
+		if sceneStudioMap != nil {
+			logger.Debug(`Processing scene studio:`)
+			studioResults := sceneStudioMap.process(q, s.Common)
+
+			if len(studioResults) > 0 {
+				studio := &models.ScrapedSceneStudio{}
+				studioResults[0].apply(studio)
+				ret.Studio = studio
+			}
+		}
+
+		if sceneMoviesMap != nil {
+			logger.Debug(`Processing scene movies:`)
+			movieResults := sceneMoviesMap.process(q, s.Common)
+
+			for _, p := range movieResults {
+				movie := &models.ScrapedSceneMovie{}
+				p.apply(movie)
+				ret.Movies = append(ret.Movies, movie)
+			}
+		}
+	}
+
+	return &ret, nil
+}
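In convertPostProcessActions above, the deprecated single fields are converted into the same postProcessActions list, in the fixed order replace, then subScraper, then parseDate. Assuming that conversion, the two attribute configurations below should behave identically; the selector and patterns are illustrative:

# deprecated form
Birthdate:
  selector: //span[@class="dob"]
  replace:
    - regex: \s+\(.*\)
      with: ""
  parseDate: January 2, 2006

# equivalent postProcess form
Birthdate:
  selector: //span[@class="dob"]
  postProcess:
    - replace:
        - regex: \s+\(.*\)
          with: ""
    - parseDate: January 2, 2006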
pkg/scraper/mapped_test.go (new file)
@@ -0,0 +1,31 @@
+package scraper
+
+import (
+	"testing"
+
+	"gopkg.in/yaml.v2"
+)
+
+func TestInvalidPostProcessAction(t *testing.T) {
+	yamlStr := `name: Test
+performerByURL:
+  - action: scrapeXPath
+    scraper: performerScraper
+xPathScrapers:
+  performerScraper:
+    performer:
+      Name:
+        selector: //div/a/@href
+        postProcess:
+          - parseDate: Jan 2, 2006
+          - anything
+`
+
+	c := &config{}
+	err := yaml.Unmarshal([]byte(yamlStr), &c)
+
+	if err == nil {
+		t.Error("expected error unmarshalling with invalid post-process action")
+		return
+	}
+}
@@ -7,19 +7,42 @@ import (
 	"strconv"
 
 	"github.com/stashapp/stash/pkg/logger"
-	"github.com/stashapp/stash/pkg/manager/config"
 	"github.com/stashapp/stash/pkg/models"
 )
 
-var scrapers []scraperConfig
+// GlobalConfig contains the global scraper options.
+type GlobalConfig struct {
+	// User Agent used when scraping using http.
+	UserAgent string
+	Path      string
+}
 
-func loadScrapers() ([]scraperConfig, error) {
-	if scrapers != nil {
-		return scrapers, nil
-	}
+// Cache stores scraper details.
+type Cache struct {
+	scrapers     []config
+	globalConfig GlobalConfig
+}
 
-	path := config.GetScrapersPath()
-	scrapers = make([]scraperConfig, 0)
+// NewCache returns a new Cache loading scraper configurations from the
+// scraper path provided in the global config object. It returns a new
+// instance and an error if the scraper directory could not be loaded.
+//
+// Scraper configurations are loaded from yml files in the provided scrapers
+// directory and any subdirectories.
+func NewCache(globalConfig GlobalConfig) (*Cache, error) {
+	scrapers, err := loadScrapers(globalConfig.Path)
+	if err != nil {
+		return nil, err
+	}
+
+	return &Cache{
+		globalConfig: globalConfig,
+		scrapers:     scrapers,
+	}, nil
+}
+
+func loadScrapers(path string) ([]config, error) {
+	scrapers := make([]config, 0)
 
 	logger.Debugf("Reading scraper configs from %s", path)
 	scraperFiles := []string{}
@@ -36,7 +59,7 @@ func loadScrapers() ([]scraperConfig, error) {
 	}
 
 	// add built-in freeones scraper
-	scrapers = append(scrapers, GetFreeonesScraper())
+	scrapers = append(scrapers, getFreeonesScraper())
 
 	for _, file := range scraperFiles {
 		scraper, err := loadScraperFromYAMLFile(file)
@@ -50,55 +73,55 @@ func loadScrapers() ([]scraperConfig, error) {
 	return scrapers, nil
 }
 
-func ReloadScrapers() error {
-	scrapers = nil
-	_, err := loadScrapers()
-	return err
-}
-
-func ListPerformerScrapers() ([]*models.Scraper, error) {
-	// read scraper config files from the directory and cache
-	scrapers, err := loadScrapers()
-
+// ReloadScrapers clears the scraper cache and reloads from the scraper path.
+// In the event of an error during loading, the cache will be left empty.
+func (c *Cache) ReloadScrapers() error {
+	c.scrapers = nil
+	scrapers, err := loadScrapers(c.globalConfig.Path)
 	if err != nil {
-		return nil, err
+		return err
 	}
 
+	c.scrapers = scrapers
+	return nil
+}
+
+// UpdateConfig updates the global config for the cache. If the scraper path
+// has changed, ReloadScrapers will need to be called separately.
+func (c *Cache) UpdateConfig(globalConfig GlobalConfig) {
+	c.globalConfig = globalConfig
+}
+
+// ListPerformerScrapers returns a list of scrapers that are capable of
+// scraping performers.
+func (c Cache) ListPerformerScrapers() []*models.Scraper {
 	var ret []*models.Scraper
-	for _, s := range scrapers {
+	for _, s := range c.scrapers {
 		// filter on type
 		if s.supportsPerformers() {
 			ret = append(ret, s.toScraper())
 		}
 	}
 
-	return ret, nil
+	return ret
 }
 
-func ListSceneScrapers() ([]*models.Scraper, error) {
-	// read scraper config files from the directory and cache
-	scrapers, err := loadScrapers()
-
-	if err != nil {
-		return nil, err
-	}
-
+// ListSceneScrapers returns a list of scrapers that are capable of
+// scraping scenes.
+func (c Cache) ListSceneScrapers() []*models.Scraper {
 	var ret []*models.Scraper
-	for _, s := range scrapers {
+	for _, s := range c.scrapers {
 		// filter on type
 		if s.supportsScenes() {
 			ret = append(ret, s.toScraper())
 		}
 	}
 
-	return ret, nil
+	return ret
 }
 
-func findScraper(scraperID string) *scraperConfig {
-	// read scraper config files from the directory and cache
-	loadScrapers()
-
-	for _, s := range scrapers {
+func (c Cache) findScraper(scraperID string) *config {
+	for _, s := range c.scrapers {
 		if s.ID == scraperID {
 			return &s
 		}
@@ -107,27 +130,32 @@ func findScraper(scraperID string) *scraperConfig {
 	return nil
 }
 
-func ScrapePerformerList(scraperID string, query string) ([]*models.ScrapedPerformer, error) {
+// ScrapePerformerList uses the scraper with the provided ID to query for
+// performers using the provided query string. It returns a list of
+// scraped performer data.
+func (c Cache) ScrapePerformerList(scraperID string, query string) ([]*models.ScrapedPerformer, error) {
 	// find scraper with the provided id
-	s := findScraper(scraperID)
+	s := c.findScraper(scraperID)
 	if s != nil {
-		return s.ScrapePerformerNames(query)
+		return s.ScrapePerformerNames(query, c.globalConfig)
 	}
 
 	return nil, errors.New("Scraper with ID " + scraperID + " not found")
 }
 
-func ScrapePerformer(scraperID string, scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
+// ScrapePerformer uses the scraper with the provided ID to scrape a
+// performer using the provided performer fragment.
+func (c Cache) ScrapePerformer(scraperID string, scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
 	// find scraper with the provided id
-	s := findScraper(scraperID)
+	s := c.findScraper(scraperID)
 	if s != nil {
-		ret, err := s.ScrapePerformer(scrapedPerformer)
+		ret, err := s.ScrapePerformer(scrapedPerformer, c.globalConfig)
 		if err != nil {
 			return nil, err
 		}
 
 		// post-process - set the image if applicable
-		if err := setPerformerImage(ret); err != nil {
+		if err := setPerformerImage(ret, c.globalConfig); err != nil {
 			logger.Warnf("Could not set image using URL %s: %s", *ret.Image, err.Error())
 		}
 
@@ -137,16 +165,19 @@ func ScrapePerformer(scraperID string, scrapedPerformer models.ScrapedPerformerI
 	return nil, errors.New("Scraper with ID " + scraperID + " not found")
 }
 
-func ScrapePerformerURL(url string) (*models.ScrapedPerformer, error) {
-	for _, s := range scrapers {
+// ScrapePerformerURL uses the first scraper it finds that matches the URL
+// provided to scrape a performer. If no scrapers are found that match
+// the URL, then nil is returned.
+func (c Cache) ScrapePerformerURL(url string) (*models.ScrapedPerformer, error) {
+	for _, s := range c.scrapers {
 		if s.matchesPerformerURL(url) {
-			ret, err := s.ScrapePerformerURL(url)
+			ret, err := s.ScrapePerformerURL(url, c.globalConfig)
 			if err != nil {
 				return nil, err
 			}
 
 			// post-process - set the image if applicable
-			if err := setPerformerImage(ret); err != nil {
+			if err := setPerformerImage(ret, c.globalConfig); err != nil {
 				logger.Warnf("Could not set image using URL %s: %s", *ret.Image, err.Error())
 			}
 
@@ -194,6 +225,7 @@ func matchStudio(s *models.ScrapedSceneStudio) error {
 	s.ID = &id
 	return nil
 }
+
 func matchMovie(m *models.ScrapedSceneMovie) error {
 	qb := models.NewMovieQueryBuilder()
 
@@ -232,7 +264,7 @@ func matchTag(s *models.ScrapedSceneTag) error {
 	return nil
 }
 
-func postScrapeScene(ret *models.ScrapedScene) error {
+func (c Cache) postScrapeScene(ret *models.ScrapedScene) error {
 	for _, p := range ret.Performers {
 		err := matchPerformer(p)
 		if err != nil {
@@ -262,25 +294,26 @@ func postScrapeScene(ret *models.ScrapedScene) error {
 	}
 
 	// post-process - set the image if applicable
-	if err := setSceneImage(ret); err != nil {
+	if err := setSceneImage(ret, c.globalConfig); err != nil {
 		logger.Warnf("Could not set image using URL %s: %s", *ret.Image, err.Error())
 	}
 
 	return nil
 }
 
-func ScrapeScene(scraperID string, scene models.SceneUpdateInput) (*models.ScrapedScene, error) {
+// ScrapeScene uses the scraper with the provided ID to scrape a scene.
+func (c Cache) ScrapeScene(scraperID string, scene models.SceneUpdateInput) (*models.ScrapedScene, error) {
 	// find scraper with the provided id
-	s := findScraper(scraperID)
+	s := c.findScraper(scraperID)
 	if s != nil {
-		ret, err := s.ScrapeScene(scene)
+		ret, err := s.ScrapeScene(scene, c.globalConfig)
 
 		if err != nil {
 			return nil, err
 		}
 
 		if ret != nil {
-			err = postScrapeScene(ret)
+			err = c.postScrapeScene(ret)
 			if err != nil {
 				return nil, err
 			}
@@ -292,16 +325,19 @@ func ScrapeScene(scraperID string, scene models.SceneUpdateInput) (*models.Scrap
 	return nil, errors.New("Scraper with ID " + scraperID + " not found")
 }
 
-func ScrapeSceneURL(url string) (*models.ScrapedScene, error) {
-	for _, s := range scrapers {
+// ScrapeSceneURL uses the first scraper it finds that matches the URL
+// provided to scrape a scene. If no scrapers are found that match
+// the URL, then nil is returned.
+func (c Cache) ScrapeSceneURL(url string) (*models.ScrapedScene, error) {
+	for _, s := range c.scrapers {
 		if s.matchesSceneURL(url) {
-			ret, err := s.ScrapeSceneURL(url)
+			ret, err := s.ScrapeSceneURL(url, c.globalConfig)
 
 			if err != nil {
 				return nil, err
 			}
 
-			err = postScrapeScene(ret)
+			err = c.postScrapeScene(ret)
 			if err != nil {
 				return nil, err
 			}
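With the cache in place, a caller constructs it once and reloads on demand, rather than lazily re-reading the scrapers directory on every call. A minimal usage sketch; the path, user agent, and log handling are placeholders:

package main

import (
	"log"

	"github.com/stashapp/stash/pkg/scraper"
)

func main() {
	// GlobalConfig carries the settings the cache needs up front.
	cache, err := scraper.NewCache(scraper.GlobalConfig{
		Path:      "/path/to/scrapers",
		UserAgent: "stash/0.x",
	})
	if err != nil {
		log.Fatal(err)
	}

	// Listing no longer returns an error, since configs are loaded once at
	// construction time instead of on each call.
	for _, s := range cache.ListPerformerScrapers() {
		log.Println(s.ID)
	}

	// After editing yml files on disk, reload explicitly.
	if err := cache.ReloadScrapers(); err != nil {
		log.Fatal(err)
	}
}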
@@ -6,16 +6,32 @@ import (
 	"io"
 	"io/ioutil"
 	"os/exec"
+	"path/filepath"
 	"strings"
 
 	"github.com/stashapp/stash/pkg/logger"
-	"github.com/stashapp/stash/pkg/manager/config"
 	"github.com/stashapp/stash/pkg/models"
 )
 
-func runScraperScript(command []string, inString string, out interface{}) error {
+type scriptScraper struct {
+	scraper      scraperTypeConfig
+	config       config
+	globalConfig GlobalConfig
+}
+
+func newScriptScraper(scraper scraperTypeConfig, config config, globalConfig GlobalConfig) *scriptScraper {
+	return &scriptScraper{
+		scraper:      scraper,
+		config:       config,
+		globalConfig: globalConfig,
+	}
+}
+
+func (s *scriptScraper) runScraperScript(inString string, out interface{}) error {
+	command := s.scraper.Script
+
 	cmd := exec.Command(command[0], command[1:]...)
-	cmd.Dir = config.GetScrapersPath()
+	cmd.Dir = filepath.Dir(s.config.path)
 
 	stdin, err := cmd.StdinPipe()
 	if err != nil {
@@ -65,12 +81,12 @@ func runScraperScript(command []string, inString string, out interface{}) error
 	return nil
 }
 
-func scrapePerformerNamesScript(c scraperTypeConfig, name string) ([]*models.ScrapedPerformer, error) {
+func (s *scriptScraper) scrapePerformersByName(name string) ([]*models.ScrapedPerformer, error) {
 	inString := `{"name": "` + name + `"}`
 
 	var performers []models.ScrapedPerformer
 
-	err := runScraperScript(c.Script, inString, &performers)
+	err := s.runScraperScript(inString, &performers)
 
 	// convert to pointers
 	var ret []*models.ScrapedPerformer
@@ -83,7 +99,7 @@ func scrapePerformerNamesScript(c scraperTypeConfig, name string) ([]*models.Scr
 	return ret, err
 }
 
-func scrapePerformerFragmentScript(c scraperTypeConfig, scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
+func (s *scriptScraper) scrapePerformerByFragment(scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
 	inString, err := json.Marshal(scrapedPerformer)
 
 	if err != nil {
@@ -92,22 +108,22 @@ func scrapePerformerFragmentScript(c scraperTypeConfig, scrapedPerformer models.
 
 	var ret models.ScrapedPerformer
 
-	err = runScraperScript(c.Script, string(inString), &ret)
+	err = s.runScraperScript(string(inString), &ret)
 
 	return &ret, err
 }
 
-func scrapePerformerURLScript(c scraperTypeConfig, url string) (*models.ScrapedPerformer, error) {
+func (s *scriptScraper) scrapePerformerByURL(url string) (*models.ScrapedPerformer, error) {
 	inString := `{"url": "` + url + `"}`
 
 	var ret models.ScrapedPerformer
 
-	err := runScraperScript(c.Script, string(inString), &ret)
+	err := s.runScraperScript(string(inString), &ret)
 
 	return &ret, err
 }
 
-func scrapeSceneFragmentScript(c scraperTypeConfig, scene models.SceneUpdateInput) (*models.ScrapedScene, error) {
+func (s *scriptScraper) scrapeSceneByFragment(scene models.SceneUpdateInput) (*models.ScrapedScene, error) {
 	inString, err := json.Marshal(scene)
 
 	if err != nil {
@@ -116,17 +132,17 @@ func scrapeSceneFragmentScript(c scraperTypeConfig, scene models.SceneUpdateInpu
 
 	var ret models.ScrapedScene
 
-	err = runScraperScript(c.Script, string(inString), &ret)
+	err = s.runScraperScript(string(inString), &ret)
 
 	return &ret, err
 }
 
-func scrapeSceneURLScript(c scraperTypeConfig, url string) (*models.ScrapedScene, error) {
+func (s *scriptScraper) scrapeSceneByURL(url string) (*models.ScrapedScene, error) {
 	inString := `{"url": "` + url + `"}`
 
 	var ret models.ScrapedScene
 
-	err := runScraperScript(c.Script, string(inString), &ret)
+	err := s.runScraperScript(string(inString), &ret)
 
 	return &ret, err
 }
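The refactored script scraper keeps the same inter-process contract as before: the input fragment is serialised as JSON to the child process's stdin, and the scraped result is decoded as JSON from its stdout. A standalone sketch of that contract, with a shell one-liner standing in for a real scraper script (assumes a unix sh):

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"os/exec"
)

type performer struct {
	Name *string `json:"name"`
}

func main() {
	// The command and its JSON reply are illustrative; a real scraper
	// script is whatever the yml config lists under `script:`.
	cmd := exec.Command("sh", "-c", `cat >/dev/null; echo '{"name":"Example"}'`)
	cmd.Stdin = bytes.NewBufferString(`{"url": "https://example.com/performer"}`)

	out, err := cmd.Output()
	if err != nil {
		panic(err)
	}

	var p performer
	if err := json.Unmarshal(out, &p); err != nil {
		panic(err)
	}
	fmt.Println(*p.Name) // Example
}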
@@ -2,6 +2,7 @@ package scraper
 
 import (
 	"context"
+	"errors"
 	"strconv"
 
 	"github.com/jinzhu/copier"
@@ -10,8 +11,22 @@ import (
 	"github.com/stashapp/stash/pkg/models"
 )
 
-func getStashClient(c scraperTypeConfig) *graphql.Client {
-	url := c.scraperConfig.StashServer.URL
+type stashScraper struct {
+	scraper      scraperTypeConfig
+	config       config
+	globalConfig GlobalConfig
+}
+
+func newStashScraper(scraper scraperTypeConfig, config config, globalConfig GlobalConfig) *stashScraper {
+	return &stashScraper{
+		scraper:      scraper,
+		config:       config,
+		globalConfig: globalConfig,
+	}
+}
+
+func (s *stashScraper) getStashClient() *graphql.Client {
+	url := s.config.StashServer.URL
 	return graphql.NewClient(url+"/graphql", nil)
 }
 
@@ -33,8 +48,8 @@ type stashFindPerformerNamesResultType struct {
 	Performers []*stashFindPerformerNamePerformer `graphql:"performers"`
 }
 
-func scrapePerformerNamesStash(c scraperTypeConfig, name string) ([]*models.ScrapedPerformer, error) {
-	client := getStashClient(c)
+func (s *stashScraper) scrapePerformersByName(name string) ([]*models.ScrapedPerformer, error) {
+	client := s.getStashClient()
 
 	var q struct {
 		FindPerformers stashFindPerformerNamesResultType `graphql:"findPerformers(filter: $f)"`
@@ -64,8 +79,8 @@ func scrapePerformerNamesStash(c scraperTypeConfig, name string) ([]*models.Scra
 	return ret, nil
 }
 
-func scrapePerformerFragmentStash(c scraperTypeConfig, scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
-	client := getStashClient(c)
+func (s *stashScraper) scrapePerformerByFragment(scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
+	client := s.getStashClient()
 
 	var q struct {
 		FindPerformer *models.ScrapedPerformerStash `graphql:"findPerformer(id: $f)"`
@@ -91,7 +106,7 @@ func scrapePerformerFragmentStash(c scraperTypeConfig, scrapedPerformer models.S
 	}
 
 	// get the performer image directly
-	ret.Image, err = getStashPerformerImage(c.scraperConfig.StashServer.URL, performerID)
+	ret.Image, err = getStashPerformerImage(s.config.StashServer.URL, performerID, s.globalConfig)
 	if err != nil {
 		return nil, err
 	}
@@ -99,7 +114,7 @@ func scrapePerformerFragmentStash(c scraperTypeConfig, scrapedPerformer models.S
 	return &ret, nil
 }
 
-func scrapeSceneFragmentStash(c scraperTypeConfig, scene models.SceneUpdateInput) (*models.ScrapedScene, error) {
+func (s *stashScraper) scrapeSceneByFragment(scene models.SceneUpdateInput) (*models.ScrapedScene, error) {
 	// query by MD5
 	// assumes that the scene exists in the database
 	qb := models.NewSceneQueryBuilder()
@@ -123,7 +138,7 @@ func scrapeSceneFragmentStash(c scraperTypeConfig, scene models.SceneUpdateInput
 		"c": &checksum,
 	}
 
-	client := getStashClient(c)
+	client := s.getStashClient()
 	err = client.Query(context.Background(), &q, vars)
 	if err != nil {
 		return nil, err
@@ -152,10 +167,18 @@ func scrapeSceneFragmentStash(c scraperTypeConfig, scene models.SceneUpdateInput
 	}
 
 	// get the performer image directly
-	ret.Image, err = getStashSceneImage(c.scraperConfig.StashServer.URL, q.FindScene.ID)
+	ret.Image, err = getStashSceneImage(s.config.StashServer.URL, q.FindScene.ID, s.globalConfig)
 	if err != nil {
 		return nil, err
 	}
 
 	return &ret, nil
 }
+
+func (s *stashScraper) scrapePerformerByURL(url string) (*models.ScrapedPerformer, error) {
+	return nil, errors.New("scrapePerformerByURL not supported for stash scraper")
+}
+
+func (s *stashScraper) scrapeSceneByURL(url string) (*models.ScrapedScene, error) {
+	return nil, errors.New("scrapeSceneByURL not supported for stash scraper")
+}
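The stash scraper delegates to another stash instance over GraphQL using typed query structs. A minimal sketch of that query pattern, assuming a shurcooL-style graphql client whose NewClient and Query signatures match the calls above; the endpoint, ID value, and selected fields are illustrative:

package main

import (
	"context"
	"fmt"

	"github.com/shurcooL/graphql"
)

func main() {
	// Build a client against a remote stash instance, as getStashClient does.
	client := graphql.NewClient("http://localhost:9999/graphql", nil)

	// The struct shape plus graphql tags define the query document.
	var q struct {
		FindPerformer *struct {
			Name graphql.String
		} `graphql:"findPerformer(id: $f)"`
	}

	vars := map[string]interface{}{
		"f": graphql.ID("42"),
	}

	if err := client.Query(context.Background(), &q, vars); err != nil {
		panic(err)
	}
	if q.FindPerformer != nil {
		fmt.Println(q.FindPerformer.Name)
	}
}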
@@ -6,7 +6,6 @@ import (
 	"net/http"
 	"net/http/cookiejar"
 	"net/url"
-	"reflect"
 	"regexp"
 	"strings"
 	"time"
@@ -17,7 +16,6 @@ import (
 	"golang.org/x/net/publicsuffix"
 
 	"github.com/stashapp/stash/pkg/logger"
-	"github.com/stashapp/stash/pkg/manager/config"
 	"github.com/stashapp/stash/pkg/models"
 )
 
@@ -25,572 +23,94 @@ import (
 // configurable at some point.
 const scrapeGetTimeout = time.Second * 30
 
-var debugMode = false
-
-type commonXPathConfig map[string]string
-
-func (c commonXPathConfig) applyCommon(src string) string {
-	ret := src
-	for commonKey, commonVal := range c {
-		if strings.Contains(ret, commonKey) {
-			ret = strings.Replace(ret, commonKey, commonVal, -1)
-		}
-	}
-
-	return ret
-}
-
-type xpathScraperConfig map[string]interface{}
-
-func createXPathScraperConfig(src map[interface{}]interface{}) xpathScraperConfig {
-	ret := make(xpathScraperConfig)
-
-	if src != nil {
-		for k, v := range src {
-			keyStr, isStr := k.(string)
-			if isStr {
-				ret[keyStr] = v
-			}
-		}
-	}
-
-	return ret
-}
-
-type xpathRegexConfig map[interface{}]interface{}
-type xpathRegexConfigs []xpathRegexConfig
-
-func (c xpathRegexConfig) apply(value string) string {
-	regex := ""
-	with := ""
-
-	if regexI, _ := c["regex"]; regexI != nil {
-		regex, _ = regexI.(string)
-	}
-	if withI, _ := c["with"]; withI != nil {
-		with, _ = withI.(string)
-	}
-
-	if regex != "" {
-		re, err := regexp.Compile(regex)
-		if err != nil {
-			logger.Warnf("Error compiling regex '%s': %s", regex, err.Error())
-			return value
-		}
-
-		ret := re.ReplaceAllString(value, with)
-		// replace lines if needed to protect from commonPostprocess
-		if with == "\n" {
-			ret = replaceLines(ret)
-		}
-
-		logger.Debugf(`Replace: '%s' with '%s'`, regex, with)
-		logger.Debugf("Before: %s", value)
-		logger.Debugf("After: %s", ret)
-		return ret
-	}
-
-	return value
-}
-
-func (c xpathRegexConfigs) apply(value string) string {
-	// apply regex in order
-	for _, config := range c {
-		value = config.apply(value)
-	}
-
-	// remove whitespace again
-	value = commonPostProcess(value)
-
-	// restore replaced lines
-	value = restoreLines(value)
-	return value
-}
-
-type xpathScraperAttrConfig map[interface{}]interface{}
-
-func (c xpathScraperAttrConfig) getString(key string) string {
-	ret, _ := c[key]
-
-	if ret == nil {
-		return ""
-	}
-
-	asStr, _ := ret.(string)
-	return asStr
-}
-
-func (c xpathScraperAttrConfig) getSelector() string {
-	const selectorKey = "selector"
-	return c.getString(selectorKey)
-}
-
-func (c xpathScraperAttrConfig) getConcat() string {
-	const concatKey = "concat"
-	return c.getString(concatKey)
-}
-
-func (c xpathScraperAttrConfig) hasConcat() bool {
-	return c.getConcat() != ""
-}
-
-func (c xpathScraperAttrConfig) getParseDate() string {
-	const parseDateKey = "parseDate"
-	return c.getString(parseDateKey)
-}
-
-func (c xpathScraperAttrConfig) getSplit() string {
-	const splitKey = "split"
-	return c.getString(splitKey)
-}
-
-func (c xpathScraperAttrConfig) hasSplit() bool {
-	return c.getSplit() != ""
-}
-
-func (c xpathScraperAttrConfig) getReplace() xpathRegexConfigs {
-	const replaceKey = "replace"
-	val, _ := c[replaceKey]
-
-	var ret xpathRegexConfigs
-	if val == nil {
-		return ret
-	}
-
-	asSlice, _ := val.([]interface{})
-
-	for _, v := range asSlice {
-		asMap, _ := v.(map[interface{}]interface{})
-		ret = append(ret, xpathRegexConfig(asMap))
-	}
-
-	return ret
-}
-
-func (c xpathScraperAttrConfig) getSubScraper() xpathScraperAttrConfig {
-	const subScraperKey = "subScraper"
-	val, _ := c[subScraperKey]
-
-	if val == nil {
-		return nil
-	}
-
-	asMap, _ := val.(map[interface{}]interface{})
-	if asMap != nil {
-		return xpathScraperAttrConfig(asMap)
-	}
-
-	return nil
-}
-
-func (c xpathScraperAttrConfig) concatenateResults(nodes []*html.Node) string {
-	separator := c.getConcat()
-	result := []string{}
-
-	for _, elem := range nodes {
-		text := NodeText(elem)
-		text = commonPostProcess(text)
-
-		result = append(result, text)
-	}
-
-	return strings.Join(result, separator)
-}
-
-func (c xpathScraperAttrConfig) parseDate(value string) string {
-	parseDate := c.getParseDate()
-
-	if parseDate == "" {
-		return value
-	}
-
-	// try to parse the date using the pattern
-	// if it fails, then just fall back to the original value
-	parsedValue, err := time.Parse(parseDate, value)
-	if err != nil {
-		logger.Warnf("Error parsing date string '%s' using format '%s': %s", value, parseDate, err.Error())
-		return value
-	}
-
-	// convert it into our date format
-	const internalDateFormat = "2006-01-02"
-	return parsedValue.Format(internalDateFormat)
-}
-
-func (c xpathScraperAttrConfig) splitString(value string) []string {
-	separator := c.getSplit()
-	var res []string
-
-	if separator == "" {
-		return []string{value}
-	}
-
-	for _, str := range strings.Split(value, separator) {
-		if str != "" {
-			res = append(res, str)
-		}
-	}
-
-	return res
-}
-
-// setKeyAndSplit sets the key "k" for the results "ret" and splits if needed
-// "i" is the index starting position
-func (c xpathScraperAttrConfig) setKeyAndSplit(ret *xPathResults, value string, k string, i int) {
-	if c.hasSplit() {
-		for j, txt := range c.splitString(value) {
-			*ret = ret.setKey(j+i, k, txt)
-		}
-	} else {
-		*ret = ret.setKey(i, k, value)
-	}
-
-}
-
-func (c xpathScraperAttrConfig) replaceRegex(value string) string {
-	replace := c.getReplace()
-	return replace.apply(value)
-}
-
-func (c xpathScraperAttrConfig) applySubScraper(value string) string {
-	subScraper := c.getSubScraper()
-
-	if subScraper == nil {
-		return value
-	}
-
-	logger.Debugf("Sub-scraping for: %s", value)
-	doc, err := loadURL(value)
-
-	if err != nil {
-		logger.Warnf("Error getting URL '%s' for sub-scraper: %s", value, err.Error())
-		return ""
-	}
-
-	found := runXPathQuery(doc, subScraper.getSelector(), nil)
-
-	if len(found) > 0 {
-		// check if we're concatenating the results into a single result
-		var result string
-		if subScraper.hasConcat() {
-			result = subScraper.concatenateResults(found)
-		} else {
-			result = NodeText(found[0])
-			result = commonPostProcess(result)
-		}
-
-		result = subScraper.postProcess(result)
-		return result
-	}
-
-	return ""
-}
-
-func (c xpathScraperAttrConfig) postProcess(value string) string {
-	// perform regex replacements first
-	value = c.replaceRegex(value)
-	value = c.applySubScraper(value)
-	value = c.parseDate(value)
-
-	return value
-}
-
-func commonPostProcess(value string) string {
-	value = strings.TrimSpace(value)
-
-	// remove multiple whitespace and end lines
-	re := regexp.MustCompile("\n")
-	value = re.ReplaceAllString(value, "")
-	re = regexp.MustCompile(" +")
-	value = re.ReplaceAllString(value, " ")
-
-	return value
-}
-
-// func replaceLines replaces all newlines ("\n") with alert ("\a")
-func replaceLines(value string) string {
-	re := regexp.MustCompile("\a")         // \a shouldn't exist in the string
-	value = re.ReplaceAllString(value, "") // remove it
-	re = regexp.MustCompile("\n")          // replace newlines with (\a)'s so that they don't get removed by commonPostprocess
-	value = re.ReplaceAllString(value, "\a")
-
-	return value
-}
-
-// func restoreLines replaces all alerts ("\a") with newlines ("\n")
-func restoreLines(value string) string {
-	re := regexp.MustCompile("\a")
-	value = re.ReplaceAllString(value, "\n")
-
-	return value
-}
-
-func runXPathQuery(doc *html.Node, xpath string, common commonXPathConfig) []*html.Node {
-	// apply common
-	if common != nil {
-		xpath = common.applyCommon(xpath)
-	}
-
-	found, err := htmlquery.QueryAll(doc, xpath)
-	if err != nil {
-		logger.Warnf("Error parsing xpath expression '%s': %s", xpath, err.Error())
-		return nil
-	}
-
-	return found
-}
-
-func (s xpathScraperConfig) process(doc *html.Node, common commonXPathConfig) xPathResults {
-	var ret xPathResults
-
-	for k, value := range s {
-		switch v := value.(type) {
-		case string:
-			found := runXPathQuery(doc, v, common)
-
-			if len(found) > 0 {
-				for i, elem := range found {
-					text := NodeText(elem)
-					text = commonPostProcess(text)
-
-					ret = ret.setKey(i, k, text)
-				}
-			}
-		case map[interface{}]interface{}:
-			attrConfig := xpathScraperAttrConfig(v)
-
-			found := runXPathQuery(doc, attrConfig.getSelector(), common)
-
-			if len(found) > 0 {
-				// check if we're concatenating the results into a single result
-				if attrConfig.hasConcat() {
-					result := attrConfig.concatenateResults(found)
-					result = attrConfig.postProcess(result)
-					attrConfig.setKeyAndSplit(&ret, result, k, 0)
-				} else {
-					for i, elem := range found {
-						text := NodeText(elem)
-						text = commonPostProcess(text)
-						text = attrConfig.postProcess(text)
-						attrConfig.setKeyAndSplit(&ret, text, k, i)
-					}
-				}
-			}
-		}
-	}
-
-	return ret
-}
-
-type xpathScrapers map[string]*xpathScraper
-
 type xpathScraper struct {
-	Common    commonXPathConfig  `yaml:"common"`
-	Scene     xpathScraperConfig `yaml:"scene"`
-	Performer xpathScraperConfig `yaml:"performer"`
+	scraper      scraperTypeConfig
+	config       config
+	globalConfig GlobalConfig
 }
 
-const (
-	XPathScraperConfigSceneTags       = "Tags"
-	XPathScraperConfigScenePerformers = "Performers"
-	XPathScraperConfigSceneStudio     = "Studio"
-	XPathScraperConfigSceneMovies     = "Movies"
-)
-
-func (s xpathScraper) GetSceneSimple() xpathScraperConfig {
-	// exclude the complex sub-configs
-	ret := make(xpathScraperConfig)
-	mapped := s.Scene
-
-	if mapped != nil {
-		for k, v := range mapped {
-			if k != XPathScraperConfigSceneTags && k != XPathScraperConfigScenePerformers && k != XPathScraperConfigSceneStudio && k != XPathScraperConfigSceneMovies {
-				ret[k] = v
-			}
-		}
-	}
-
-	return ret
-}
-
-func (s xpathScraper) getSceneSubMap(key string) xpathScraperConfig {
-	var ret map[interface{}]interface{}
-	mapped := s.Scene
-
-	if mapped != nil {
-		v, ok := mapped[key]
-		if ok {
-			ret, _ = v.(map[interface{}]interface{})
-		}
-	}
-
-	if ret != nil {
-		return createXPathScraperConfig(ret)
-	}
-
-	return nil
-}
-
-func (s xpathScraper) GetScenePerformers() xpathScraperConfig {
-	return s.getSceneSubMap(XPathScraperConfigScenePerformers)
-}
-
-func (s xpathScraper) GetSceneTags() xpathScraperConfig {
-	return s.getSceneSubMap(XPathScraperConfigSceneTags)
-}
-
-func (s xpathScraper) GetSceneStudio() xpathScraperConfig {
-	return s.getSceneSubMap(XPathScraperConfigSceneStudio)
-}
-
-func (s xpathScraper) GetSceneMovies() xpathScraperConfig {
-	return s.getSceneSubMap(XPathScraperConfigSceneMovies)
-}
-
-func (s xpathScraper) scrapePerformer(doc *html.Node) (*models.ScrapedPerformer, error) {
-	var ret models.ScrapedPerformer
-
-	performerMap := s.Performer
-	if performerMap == nil {
-		return nil, nil
-	}
-
-	results := performerMap.process(doc, s.Common)
-	if len(results) > 0 {
-		results[0].apply(&ret)
-	}
-
-	return &ret, nil
-}
-
-func (s xpathScraper) scrapePerformers(doc *html.Node) ([]*models.ScrapedPerformer, error) {
-	var ret []*models.ScrapedPerformer
-
-	performerMap := s.Performer
-	if performerMap == nil {
-		return nil, nil
-	}
-
-	results := performerMap.process(doc, s.Common)
-	for _, r := range results {
-		var p models.ScrapedPerformer
-		r.apply(&p)
-		ret = append(ret, &p)
-	}
-
-	return ret, nil
-}
-
-func (s xpathScraper) scrapeScene(doc *html.Node) (*models.ScrapedScene, error) {
-	var ret models.ScrapedScene
-
-	sceneMap := s.GetSceneSimple()
-	if sceneMap == nil {
-		return nil, nil
-	}
-
-	scenePerformersMap := s.GetScenePerformers()
-	sceneTagsMap := s.GetSceneTags()
-	sceneStudioMap := s.GetSceneStudio()
-	sceneMoviesMap := s.GetSceneMovies()
-
-	logger.Debug(`Processing scene:`)
-	results := sceneMap.process(doc, s.Common)
-	if len(results) > 0 {
-		results[0].apply(&ret)
-
-		// now apply the performers and tags
-		if scenePerformersMap != nil {
-			logger.Debug(`Processing scene performers:`)
-			performerResults := scenePerformersMap.process(doc, s.Common)
-
-			for _, p := range performerResults {
-				performer := &models.ScrapedScenePerformer{}
-				p.apply(performer)
-				ret.Performers = append(ret.Performers, performer)
-			}
-		}
-
-		if sceneTagsMap != nil {
-			logger.Debug(`Processing scene tags:`)
-			tagResults := sceneTagsMap.process(doc, s.Common)
-
-			for _, p := range tagResults {
-				tag := &models.ScrapedSceneTag{}
-				p.apply(tag)
-				ret.Tags = append(ret.Tags, tag)
-			}
-		}
-
-		if sceneStudioMap != nil {
-			logger.Debug(`Processing scene studio:`)
-			studioResults := sceneStudioMap.process(doc, s.Common)
-
-			if len(studioResults) > 0 {
-				studio := &models.ScrapedSceneStudio{}
-				studioResults[0].apply(studio)
-				ret.Studio = studio
-			}
-		}
-
-		if sceneMoviesMap != nil {
-			logger.Debug(`Processing scene movies:`)
-			movieResults := sceneMoviesMap.process(doc, s.Common)
-
-			for _, p := range movieResults {
-				movie := &models.ScrapedSceneMovie{}
-				p.apply(movie)
-				ret.Movies = append(ret.Movies, movie)
-			}
-
-		}
-	}
-
-	return &ret, nil
-}
-
-type xPathResult map[string]string
-type xPathResults []xPathResult
-
-func (r xPathResult) apply(dest interface{}) {
-	destVal := reflect.ValueOf(dest)
-
-	// dest should be a pointer
-	destVal = destVal.Elem()
-
-	for key, value := range r {
-		field := destVal.FieldByName(key)
-
-		if field.IsValid() {
-			var reflectValue reflect.Value
-			if field.Kind() == reflect.Ptr {
-				// need to copy the value, otherwise everything is set to the
-				// same pointer
-				localValue := value
-				reflectValue = reflect.ValueOf(&localValue)
-			} else {
-				reflectValue = reflect.ValueOf(value)
-			}
-
-			field.Set(reflectValue)
-		} else {
-			logger.Errorf("Field %s does not exist in %T", key, dest)
-		}
-	}
-}
-
-func (r xPathResults) setKey(index int, key string, value string) xPathResults {
-	if index >= len(r) {
-		r = append(r, make(xPathResult))
-	}
-
-	logger.Debugf(`[%d][%s] = %s`, index, key, value)
-	r[index][key] = value
-	return r
-}
+func newXpathScraper(scraper scraperTypeConfig, config config, globalConfig GlobalConfig) *xpathScraper {
+	return &xpathScraper{
+		scraper:      scraper,
+		config:       config,
+		globalConfig: globalConfig,
+	}
+}
+
+func (s *xpathScraper) getXpathScraper() *mappedScraper {
+	return s.config.XPathScrapers[s.scraper.Scraper]
+}
 
-func loadURL(url string) (*html.Node, error) {
+func (s *xpathScraper) scrapeURL(url string) (*html.Node, *mappedScraper, error) {
+	scraper := s.getXpathScraper()
+
+	if scraper == nil {
+		return nil, nil, errors.New("xpath scraper with name " + s.scraper.Scraper + " not found in config")
+	}
+
+	doc, err := s.loadURL(url)
+
+	if err != nil {
+		return nil, nil, err
+	}
+
+	return doc, scraper, nil
+}
+
+func (s *xpathScraper) scrapePerformerByURL(url string) (*models.ScrapedPerformer, error) {
+	doc, scraper, err := s.scrapeURL(url)
+	if err != nil {
+		return nil, err
+	}
+
+	q := s.getXPathQuery(doc)
+	return scraper.scrapePerformer(q)
+}
+
+func (s *xpathScraper) scrapeSceneByURL(url string) (*models.ScrapedScene, error) {
+	doc, scraper, err := s.scrapeURL(url)
+	if err != nil {
+		return nil, err
+	}
+
+	q := s.getXPathQuery(doc)
+	return scraper.scrapeScene(q)
+}
+
+func (s *xpathScraper) scrapePerformersByName(name string) ([]*models.ScrapedPerformer, error) {
+	scraper := s.getXpathScraper()
+
+	if scraper == nil {
+		return nil, errors.New("xpath scraper with name " + s.scraper.Scraper + " not found in config")
+	}
+
+	const placeholder = "{}"
+
+	// replace the placeholder string with the URL-escaped name
+	escapedName := url.QueryEscape(name)
+
+	url := s.scraper.QueryURL
+	url = strings.Replace(url, placeholder, escapedName, -1)
+
+	doc, err := s.loadURL(url)
+
+	if err != nil {
+		return nil, err
+	}
+
+	q := s.getXPathQuery(doc)
+	return scraper.scrapePerformers(q)
+}
+
+func (s *xpathScraper) scrapePerformerByFragment(scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
+	return nil, errors.New("scrapePerformerByFragment not supported for xpath scraper")
+}
+
+func (s *xpathScraper) scrapeSceneByFragment(scene models.SceneUpdateInput) (*models.ScrapedScene, error) {
+	return nil, errors.New("scrapeSceneByFragment not supported for xpath scraper")
+}
+
+func (s *xpathScraper) loadURL(url string) (*html.Node, error) {
 	options := cookiejar.Options{
 		PublicSuffixList: publicsuffix.List,
 	}
@@ -615,7 +135,7 @@ func loadURL(url string) (*html.Node, error) {
 		return nil, err
 	}
 
-	userAgent := config.GetScraperUserAgent()
+	userAgent := s.globalConfig.UserAgent
 	if userAgent != "" {
 		req.Header.Set("User-Agent", userAgent)
 	}
@@ -633,7 +153,7 @@ func loadURL(url string) (*html.Node, error) {
 
 	ret, err := html.Parse(r)
 
-	if err == nil && debugMode {
+	if err == nil && s.config.DebugOptions != nil && s.config.DebugOptions.PrintHTML {
 		var b bytes.Buffer
 		html.Render(&b, ret)
 		logger.Infof("loadURL (%s) response: \n%s", url, b.String())
@@ -642,73 +162,65 @@ func loadURL(url string) (*html.Node, error) {
 	return ret, err
 }
 
-func scrapePerformerURLXpath(c scraperTypeConfig, url string) (*models.ScrapedPerformer, error) {
-	scraper := c.scraperConfig.XPathScrapers[c.Scraper]
-
-	if scraper == nil {
-		return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
-	}
-
-	if c.scraperConfig != nil && c.scraperConfig.DebugOptions != nil && c.scraperConfig.DebugOptions.PrintHTML {
-		debugMode = true
-	}
-
-	doc, err := loadURL(url)
-
-	if err != nil {
-		return nil, err
-	}
-
-	return scraper.scrapePerformer(doc)
+func (s *xpathScraper) getXPathQuery(doc *html.Node) *xpathQuery {
+	return &xpathQuery{
+		doc:     doc,
+		scraper: s,
+	}
 }
 
-func scrapeSceneURLXPath(c scraperTypeConfig, url string) (*models.ScrapedScene, error) {
-	scraper := c.scraperConfig.XPathScrapers[c.Scraper]
-
-	if scraper == nil {
-		return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
-	}
-
-	if c.scraperConfig != nil && c.scraperConfig.DebugOptions != nil && c.scraperConfig.DebugOptions.PrintHTML {
-		debugMode = true
-	}
-
-	doc, err := loadURL(url)
-
-	if err != nil {
-		return nil, err
-	}
-
-	return scraper.scrapeScene(doc)
+type xpathQuery struct {
+	doc     *html.Node
+	scraper *xpathScraper
 }
 
-func scrapePerformerNamesXPath(c scraperTypeConfig, name string) ([]*models.ScrapedPerformer, error) {
-	scraper := c.scraperConfig.XPathScrapers[c.Scraper]
-
-	if scraper == nil {
-		return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
-	}
-
-	const placeholder = "{}"
-
-	// replace the placeholder string with the URL-escaped name
-	escapedName := url.QueryEscape(name)
-
-	u := c.QueryURL
-	u = strings.Replace(u, placeholder, escapedName, -1)
-
-	doc, err := loadURL(u)
-
+func (q *xpathQuery) runQuery(selector string) []string {
+	found, err := htmlquery.QueryAll(q.doc, selector)
 	if err != nil {
-		return nil, err
+		logger.Warnf("Error parsing xpath expression '%s': %s", selector, err.Error())
+		return nil
 	}
 
-	return scraper.scrapePerformers(doc)
+	var ret []string
+	for _, n := range found {
+		// don't add empty strings
+		nodeText := q.nodeText(n)
+		if nodeText != "" {
+			ret = append(ret, q.nodeText(n))
+		}
+	}
+
+	return ret
 }
 
-func NodeText(n *html.Node) string {
+func (q *xpathQuery) nodeText(n *html.Node) string {
+	var ret string
 	if n != nil && n.Type == html.CommentNode {
-		return htmlquery.OutputHTML(n, true)
+		ret = htmlquery.OutputHTML(n, true)
 	}
-	return htmlquery.InnerText(n)
+	ret = htmlquery.InnerText(n)
+
+	// trim all leading and trailing whitespace
+	ret = strings.TrimSpace(ret)
+
+	// remove multiple whitespace
+	re := regexp.MustCompile(" +")
+	ret = re.ReplaceAllString(ret, " ")
+
+	// TODO - make this optional
+	re = regexp.MustCompile("\n")
+	ret = re.ReplaceAllString(ret, "")
+
+	return ret
+}
+
+func (q *xpathQuery) subScrape(value string) mappedQuery {
+	doc, err := q.scraper.loadURL(value)
+
+	if err != nil {
+		logger.Warnf("Error getting URL '%s' for sub-scraper: %s", value, err.Error())
+		return nil
+	}
+
+	return q.scraper.getXPathQuery(doc)
 }
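The new xpathQuery wraps htmlquery directly: QueryAll evaluates the selector and InnerText extracts node text, including attribute values. A runnable sketch of the same flow against an inline document instead of a fetched page:

package main

import (
	"fmt"
	"strings"

	"github.com/antchfx/htmlquery"
)

func main() {
	doc, err := htmlquery.Parse(strings.NewReader(
		`<div><a href="/p/1">First</a><a href="/p/2">Second</a></div>`))
	if err != nil {
		panic(err)
	}

	nodes, err := htmlquery.QueryAll(doc, "//a/@href")
	if err != nil {
		panic(err) // invalid xpath expressions surface here
	}

	for _, n := range nodes {
		fmt.Println(htmlquery.InnerText(n))
	}
	// Output:
	// /p/1
	// /p/2
}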
@@ -1,11 +1,15 @@
|
|||||||
package scraper
|
package scraper
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
|
"net/http"
|
||||||
|
"net/http/httptest"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"github.com/antchfx/htmlquery"
|
"github.com/antchfx/htmlquery"
|
||||||
"github.com/stashapp/stash/pkg/models"
|
"github.com/stashapp/stash/pkg/models"
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
"gopkg.in/yaml.v2"
|
"gopkg.in/yaml.v2"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -183,49 +187,79 @@ func makeCommonXPath(attr string) string {
|
|||||||
return `//table[@id="biographyTable"]//tr/td[@class="paramname"]//b[text() = '` + attr + `']/ancestor::tr/td[@class="paramvalue"]`
|
return `//table[@id="biographyTable"]//tr/td[@class="paramname"]//b[text() = '` + attr + `']/ancestor::tr/td[@class="paramvalue"]`
|
||||||
}
|
}
|
||||||
|
|
||||||
func makeReplaceRegex(regex string, with string) map[interface{}]interface{} {
|
func makeSimpleAttrConfig(str string) mappedScraperAttrConfig {
|
||||||
ret := make(map[interface{}]interface{})
|
return mappedScraperAttrConfig{
|
||||||
|
Selector: str,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func makeReplaceRegex(regex string, with string) mappedRegexConfig {
|
||||||
|
ret := mappedRegexConfig{
|
||||||
|
Regex: regex,
|
||||||
|
With: with,
|
||||||
|
}
|
||||||
|
|
||||||
ret["regex"] = regex
|
|
||||||
ret["with"] = with
|
|
||||||
return ret
|
return ret
|
||||||
}
|
}
|
||||||
|
|
||||||
func makeXPathConfig() xpathScraperConfig {
|
func makeXPathConfig() mappedPerformerScraperConfig {
|
||||||
config := make(xpathScraperConfig)
|
config := mappedPerformerScraperConfig{
|
||||||
|
mappedConfig: make(mappedConfig),
|
||||||
|
}
|
||||||
|
|
||||||
config["Name"] = makeCommonXPath("Babe Name:") + `/a`
|
config.mappedConfig["Name"] = makeSimpleAttrConfig(makeCommonXPath("Babe Name:") + `/a`)
|
||||||
config["Ethnicity"] = makeCommonXPath("Ethnicity:")
|
config.mappedConfig["Ethnicity"] = makeSimpleAttrConfig(makeCommonXPath("Ethnicity:"))
|
||||||
config["Country"] = makeCommonXPath("Country of Origin:")
|
config.mappedConfig["Country"] = makeSimpleAttrConfig(makeCommonXPath("Country of Origin:"))
|
||||||
config["Aliases"] = makeCommonXPath("Aliases:")
|
config.mappedConfig["Aliases"] = makeSimpleAttrConfig(makeCommonXPath("Aliases:"))
|
||||||
config["EyeColor"] = makeCommonXPath("Eye Color:")
|
config.mappedConfig["EyeColor"] = makeSimpleAttrConfig(makeCommonXPath("Eye Color:"))
|
||||||
config["Measurements"] = makeCommonXPath("Measurements:")
|
config.mappedConfig["Measurements"] = makeSimpleAttrConfig(makeCommonXPath("Measurements:"))
|
||||||
config["FakeTits"] = makeCommonXPath("Fake boobs:")
|
config.mappedConfig["FakeTits"] = makeSimpleAttrConfig(makeCommonXPath("Fake boobs:"))
|
||||||
config["Height"] = makeCommonXPath("Height:")
|
config.mappedConfig["Height"] = makeSimpleAttrConfig(makeCommonXPath("Height:"))
|
||||||
config["Tattoos"] = makeCommonXPath("Tattoos:")
|
config.mappedConfig["Tattoos"] = makeSimpleAttrConfig(makeCommonXPath("Tattoos:"))
|
||||||
config["Piercings"] = makeCommonXPath("Piercings:")
|
config.mappedConfig["Piercings"] = makeSimpleAttrConfig(makeCommonXPath("Piercings:"))
|
||||||
|
|
||||||
// special handling for birthdate
|
// special handling for birthdate
|
||||||
birthdateAttrConfig := make(map[interface{}]interface{})
|
birthdateAttrConfig := makeSimpleAttrConfig(makeCommonXPath("Date of Birth:"))
|
||||||
birthdateAttrConfig["selector"] = makeCommonXPath("Date of Birth:")
|
|
||||||
|
|
||||||
var birthdateReplace []interface{}
|
var birthdateReplace mappedRegexConfigs
|
||||||
birthdateReplace = append(birthdateReplace, makeReplaceRegex(` \(.* years old\)`, ""))
|
// make this leave the trailing space to test existing scrapers that do so
|
||||||
|
birthdateReplace = append(birthdateReplace, makeReplaceRegex(`\(.* years old\)`, ""))
|
||||||
|
|
||||||
birthdateAttrConfig["replace"] = birthdateReplace
|
birthdateReplaceAction := postProcessReplace(birthdateReplace)
|
||||||
birthdateAttrConfig["parseDate"] = "January 2, 2006" // "July 1, 1992 (27 years old) "
|
birthdateParseDate := postProcessParseDate("January 2, 2006") // "July 1, 1992 (27 years old) "
|
||||||
config["Birthdate"] = birthdateAttrConfig
|
birthdateAttrConfig.postProcessActions = []postProcessAction{
|
||||||
|
&birthdateReplaceAction,
|
||||||
|
&birthdateParseDate,
|
||||||
|
}
|
||||||
|
config.mappedConfig["Birthdate"] = birthdateAttrConfig
|
||||||
|
|
||||||
// special handling for career length
|
// special handling for career length
|
||||||
careerLengthAttrConfig := make(map[interface{}]interface{})
|
|
||||||
// no colon in attribute header
|
// no colon in attribute header
|
||||||
careerLengthAttrConfig["selector"] = makeCommonXPath("Career Start And End")
|
careerLengthAttrConfig := makeSimpleAttrConfig(makeCommonXPath("Career Start And End"))
|
||||||
|
|
||||||
var careerLengthReplace []interface{}
|
var careerLengthReplace mappedRegexConfigs
|
||||||
careerLengthReplace = append(careerLengthReplace, makeReplaceRegex(`\s+\(.*\)`, ""))
|
careerLengthReplace = append(careerLengthReplace, makeReplaceRegex(`\s+\(.*\)`, ""))
|
||||||
careerLengthAttrConfig["replace"] = careerLengthReplace
|
careerLengthReplaceAction := postProcessReplace(careerLengthReplace)
|
||||||
|
careerLengthAttrConfig.postProcessActions = []postProcessAction{
|
||||||
|
&careerLengthReplaceAction,
|
||||||
|
}
|
||||||
|
|
||||||
config["CareerLength"] = careerLengthAttrConfig
|
config.mappedConfig["CareerLength"] = careerLengthAttrConfig
|
||||||
|
|
||||||
|
// use map post-process action for gender
|
||||||
|
genderConfig := makeSimpleAttrConfig(makeCommonXPath("Profession:"))
|
||||||
|
genderMapAction := make(postProcessMap)
|
||||||
|
genderMapAction["Porn Star"] = "Female"
|
||||||
|
genderConfig.postProcessActions = []postProcessAction{
|
||||||
|
&genderMapAction,
|
||||||
|
}
|
||||||
|
|
||||||
|
config.mappedConfig["Gender"] = genderConfig
|
||||||
|
|
||||||
|
// use fixed for height
|
||||||
|
config.mappedConfig["Height"] = mappedScraperAttrConfig{
|
||||||
|
Fixed: "1234",
|
||||||
|
}
|
||||||
|
|
||||||
return config
|
return config
|
||||||
}
|
}
|
||||||
@@ -253,11 +287,15 @@ func TestScrapePerformerXPath(t *testing.T) {
|
|||||||
|
|
||||||
xpathConfig := makeXPathConfig()
|
xpathConfig := makeXPathConfig()
|
||||||
|
|
||||||
scraper := xpathScraper{
|
scraper := mappedScraper{
|
||||||
Performer: xpathConfig,
|
Performer: &xpathConfig,
|
||||||
}
|
}
|
||||||
|
|
||||||
performer, err := scraper.scrapePerformer(doc)
|
q := &xpathQuery{
|
||||||
|
doc: doc,
|
||||||
|
}
|
||||||
|
|
||||||
|
performer, err := scraper.scrapePerformer(q)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Errorf("Error scraping performer: %s", err.Error())
|
t.Errorf("Error scraping performer: %s", err.Error())
|
||||||
@@ -274,8 +312,11 @@ func TestScrapePerformerXPath(t *testing.T) {
|
|||||||
const fakeTits = "No"
|
const fakeTits = "No"
|
||||||
const careerLength = "2012 - 2019"
|
const careerLength = "2012 - 2019"
|
||||||
const tattoosPiercings = "None"
|
const tattoosPiercings = "None"
|
||||||
|
const gender = "Female"
|
||||||
|
const height = "1234"
|
||||||
|
|
||||||
verifyField(t, performerName, performer.Name, "Name")
|
verifyField(t, performerName, performer.Name, "Name")
|
||||||
|
verifyField(t, gender, performer.Gender, "Gender")
|
||||||
verifyField(t, ethnicity, performer.Ethnicity, "Ethnicity")
|
verifyField(t, ethnicity, performer.Ethnicity, "Ethnicity")
|
||||||
verifyField(t, country, performer.Country, "Country")
|
verifyField(t, country, performer.Country, "Country")
|
||||||
|
|
||||||
@@ -290,6 +331,7 @@ func TestScrapePerformerXPath(t *testing.T) {

     verifyField(t, tattoosPiercings, performer.Tattoos, "Tattoos")
     verifyField(t, tattoosPiercings, performer.Piercings, "Piercings")
+    verifyField(t, height, performer.Height, "Height")
 }

 func TestConcatXPath(t *testing.T) {
@@ -313,18 +355,25 @@ func TestConcatXPath(t *testing.T) {
         return
     }

-    xpathConfig := make(xpathScraperConfig)
-    nameAttrConfig := make(map[interface{}]interface{})
-    nameAttrConfig["selector"] = "//div"
-    nameAttrConfig["concat"] = separator
+    xpathConfig := make(mappedConfig)
+    nameAttrConfig := mappedScraperAttrConfig{
+        Selector: "//div",
+        Concat:   separator,
+    }
     xpathConfig["Name"] = nameAttrConfig
-    xpathConfig["EyeColor"] = "//span"
+    xpathConfig["EyeColor"] = makeSimpleAttrConfig("//span")

-    scraper := xpathScraper{
-        Performer: xpathConfig,
+    scraper := mappedScraper{
+        Performer: &mappedPerformerScraperConfig{
+            mappedConfig: xpathConfig,
+        },
     }

-    performer, err := scraper.scrapePerformer(doc)
+    q := &xpathQuery{
+        doc: doc,
+    }

+    performer, err := scraper.scrapePerformer(q)

     if err != nil {
         t.Errorf("Error scraping performer: %s", err.Error())
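The `Concat` behaviour asserted above amounts to joining the text of every node the selector matches. A minimal illustration, assuming the `htmlquery` package these tests already import; the helper name is ours, not the scraper's:

```go
package scraper

import (
    "strings"

    "github.com/antchfx/htmlquery"
    "golang.org/x/net/html"
)

// concatMatches joins the inner text of every node matched by the xpath
// selector using the configured separator.
func concatMatches(doc *html.Node, selector, separator string) string {
    var parts []string
    for _, n := range htmlquery.Find(doc, selector) {
        parts = append(parts, htmlquery.InnerText(n))
    }
    return strings.Join(parts, separator)
}
```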
@@ -342,50 +391,19 @@ const sceneHTML = `

 <head>
   <title>Test Video - Pornhub.com</title>

   <meta property="og:title" content="Test Video" />
-  <meta property="og:description"
-    content="Watch Test Video on Pornhub.com, the best hardcore porn site. Pornhub is home to the widest selection of free Babe sex videos full of the hottest pornstars. If you're craving 3some XXX movies you'll find them here." />
-  <meta property="og:image"
-    content="https://di.phncdn.com/videos/201910/13/254476211/thumbs_80/(m=eaAaGwObaaaa)(mh=_V1YEGdMFS1rEYoW)9.jpg" />

   <script type="application/ld+json">
   {
-    "@context": "http://schema.org/",
-    "@type": "VideoObject",
     "name": "Test Video",
-    "embedUrl": "https://www.pornhub.com/embed/ph5da270596459c",
-    "duration": "PT00H33M27S",
-    "thumbnailUrl": "https://di.phncdn.com/videos/201910/13/254476211/thumbs_80/(m=eaAaGwObaaaa)(mh=_V1YEGdMFS1rEYoW)9.jpg",
     "uploadDate": "2019-10-13T00:33:51+00:00",
-    "description": "Watch Test Video on Pornhub.com, the best hardcore porn site. Pornhub is home to the widest selection of free Babe sex videos full of the hottest pornstars. If you're craving 3some XXX movies you'll find them here.",
-    "author" : "Mia Malkova", "interactionStatistic": [
-    {
-      "@type": "InteractionCounter",
-      "interactionType": "http://schema.org/WatchAction",
-      "userInteractionCount": "5,908,861"
-    },
-    {
-      "@type": "InteractionCounter",
-      "interactionType": "http://schema.org/LikeAction",
-      "userInteractionCount": "22,090"
-    }
-    ]
+    "author" : "Mia Malkova"
   }
   </script>
 </head>

 <body class="logged-out">
 <div class="container ">
+<div id="main-container" class="clearfix">

-<div id="main-container" class="clearfix" data-delete-check="1" data-is-private="1" data-is-premium=""
-  data-liu="0" data-next-shuffle="ph5da270596459c" data-pkey="" data-platform-pc="1" data-playlist-check="0"
-  data-playlist-id-check="0" data-playlist-geo-check="0" data-friend="0" data-playlist-user-check="0"
-  data-playlist-video-check="0" data-playlist-shuffle="0" data-shuffle-forward="ph5da270596459c"
-  data-shuffle-back="ph5da270596459c" data-min-large="1350"
-  data-video-title="Test Video">

 <div id="vpContentContainer">
 <div id="hd-leftColVideoPage">
 <div class="video-wrapper">
@@ -402,45 +420,27 @@ const sceneHTML = `
 <div class="video-detailed-info">
 <div class="video-info-row">
   From:
+  <div class="usernameWrap clearfix" data-type="channel">
-  <div class="usernameWrap clearfix" data-type="channel" data-userid="492538092"
-    data-liu-user="0"
-    data-json-url="/user/box?id=492538092&token=MTU3NzA1NTkzNIqATol8v_WrhmNTXkeflvG09C2U7UUT_NyoZUFa7iKq0mlzBkmdgAH1aNHZkJmIOHbbwmho1BehHDoA63K5Wn4."
-    data-disable-popover="0">

     <a rel="" href="/channels/sis-loves-me" class="bolded">Sis Loves Me</a>
-    <div class="avatarPosition"></div>
   </div>

-  <span class="verified-icon flag tooltipTrig"
-    data-title="Verified member"></span>
-  - 87 videos
-  <span class="subscribers-count"> 459466</span>
 </div>

 <div class="video-info-row">
 <div class="pornstarsWrapper">
   Pornstars:
   <a class="pstar-list-btn js-mxp" data-mxptype="Pornstar"
-    data-mxptext="Alex D" data-id="251341" data-login="1"
-    href="/pornstar/alex-d">Alex D <span
-      class="psbox-link-container display-none"></span>
+    data-mxptext="Alex D" href="/pornstar/alex-d">Alex D
   </a>
   , <a class="pstar-list-btn js-mxp" data-mxptype="Pornstar"
-    data-mxptext="Mia Malkova" data-id="10641" data-login="1"
-    href="/pornstar/mia-malkova">Mia Malkova <span
-      class="psbox-link-container display-none"></span>
+    data-mxptext="Mia Malkova" href="/pornstar/mia-malkova">
   </a>
   , <a class="pstar-list-btn js-mxp" data-mxptype="Pornstar"
-    data-mxptext="Riley Reid" data-id="5343" data-login="1"
-    href="/pornstar/riley-reid">Riley Reid <span
-      class="psbox-link-container display-none"></span>
+    data-mxptext="Riley Reid" href="/pornstar/riley-reid">Riley Reid
   </a>
   <div class="tooltipTrig suggestBtn" data-title="Add a pornstar">
     <a class="add-btn-small add-pornstar-btn-2">+
       <span>Suggest</span></a>
   </div>
-  <div id="deletePornstarResult" class="suggest-result"></div>
 </div>
 </div>

@@ -475,14 +475,6 @@ const sceneHTML = `
 </div>
 </div>

-<div class="video-info-row showLess">
-  <div class="productionWrapper">
-    Production:
-    <a href="/video?p=professional" rel="nofollow"
-      class="production">professional</a>
-  </div>
-</div>

 <div class="video-info-row showLess">
 <div class="tagsWrapper">
   Tags:
@@ -510,121 +502,6 @@ const sceneHTML = `
 </div>
 </div>
 </div>

-<div class="video-info-row showLess">
-  Added on: <span class="white">2 months ago</span>
-</div>
-
-<div class="video-info-row showLess">
-  Featured on: <span class="white">1 month ago</span>
-</div>
-</div>
-</div>
-
-<div class="video-action-tab jump-to-tab">
-  <div class="title">Jump to your favorite action</div>
-
-  <div class="filters mainFilter float-right">
-    <div class="dropdownTrigger">
-      <div>
-        <span class="textFilter" id="tagSort">Sequence</span>
-        <span class="arrowFilters"></span>
-      </div>
-      <ul class="filterListItem dropdownWrapper">
-        <li class="active"><a class="actionTagSort"
-            data-sort="seconds">Sequence</a></li>
-        <li><a class="actionTagSort" data-sort="tag">Alphabetical</a></li>
-      </ul>
-    </div>
-  </div>
-
-  <div class="reset"></div>
-  <div class="display-grid col-4 gap-row-none sortBy seconds">
-    <ul class="actionTagList full-width margin-none">
-      <li>
-        <a class="js-triggerJumpCat"
-          onclick="jumpToAction(862), ga('send', 'event', 'Video Page', 'click', 'Jump to Blowjob');">
-          Blowjob </a>
-
-        <var>14:22</var>
-      </li>
-      <li>
-        <a class="js-triggerJumpCat"
-          onclick="jumpToAction(1117), ga('send', 'event', 'Video Page', 'click', 'Jump to Reverse Cowgirl');">
-          Reverse Cowgirl </a>
-
-        <var>18:37</var>
-      </li>
-    </ul>
-    <ul class="actionTagList full-width margin-none">
-      <li>
-        <a class="js-triggerJumpCat"
-          onclick="jumpToAction(1182), ga('send', 'event', 'Video Page', 'click', 'Jump to Cowgirl');">
-          Cowgirl </a>
-
-        <var>19:42</var>
-      </li>
-      <li>
-        <a class="js-triggerJumpCat"
-          onclick="jumpToAction(1625), ga('send', 'event', 'Video Page', 'click', 'Jump to Cowgirl');">
-          Cowgirl </a>
-
-        <var>27:05</var>
-      </li>
-    </ul>
-    <ul class="actionTagList full-width margin-none">
-      <li>
-        <a class="js-triggerJumpCat"
-          onclick="jumpToAction(1822), ga('send', 'event', 'Video Page', 'click', 'Jump to Doggystyle');">
-          Doggystyle </a>
-
-        <var>30:22</var>
-      </li>
-    </ul>
-
-  </div>
-  <div class="display-grid col-4 gap-row-none sortBy tag">
-    <ul class="actionTagList full-width margin-none">
-      <li>
-        <a class="js-triggerJumpCat"
-          onclick="jumpToAction(862), ga('send', 'event', 'Video Page', 'click', 'Jump to Blowjob');">
-          Blowjob </a>
-
-        <var>14:22</var>
-      </li>
-      <li>
-        <a class="js-triggerJumpCat"
-          onclick="jumpToAction(1117), ga('send', 'event', 'Video Page', 'click', 'Jump to Reverse Cowgirl');">
-          Reverse Cowgirl </a>
-
-        <var>18:37</var>
-      </li>
-    </ul>
-    <ul class="actionTagList full-width margin-none">
-      <li>
-        <a class="js-triggerJumpCat"
-          onclick="jumpToAction(1182), ga('send', 'event', 'Video Page', 'click', 'Jump to Cowgirl');">
-          Cowgirl </a>
-
-        <var>19:42</var>
-      </li>
-      <li>
-        <a class="js-triggerJumpCat"
-          onclick="jumpToAction(1625), ga('send', 'event', 'Video Page', 'click', 'Jump to Cowgirl');">
-          Cowgirl </a>
-
-        <var>27:05</var>
-      </li>
-    </ul>
-    <ul class="actionTagList full-width margin-none">
-      <li>
-        <a class="js-triggerJumpCat"
-          onclick="jumpToAction(1822), ga('send', 'event', 'Video Page', 'click', 'Jump to Doggystyle');">
-          Doggystyle </a>
-
-        <var>30:22</var>
-      </li>
-    </ul>
 </div>
 </div>
 </div>
@@ -637,42 +514,45 @@ const sceneHTML = `
 </body>
 </html>`

-func makeSceneXPathConfig() xpathScraper {
-    common := make(commonXPathConfig)
+func makeSceneXPathConfig() mappedScraper {
+    common := make(commonMappedConfig)

     common["$performerElem"] = `//div[@class="pornstarsWrapper"]/a[@data-mxptype="Pornstar"]`
     common["$studioElem"] = `//div[@data-type="channel"]/a`

-    config := make(xpathScraperConfig)
+    config := mappedSceneScraperConfig{
+        mappedConfig: make(mappedConfig),
+    }

-    config["Title"] = `//meta[@property="og:title"]/@content`
+    config.mappedConfig["Title"] = makeSimpleAttrConfig(`//meta[@property="og:title"]/@content`)
     // this needs post-processing
-    config["Date"] = `//script[@type="application/ld+json"]`
+    config.mappedConfig["Date"] = makeSimpleAttrConfig(`//script[@type="application/ld+json"]`)

-    tagConfig := make(map[interface{}]interface{})
-    tagConfig["Name"] = `//div[@class="categoriesWrapper"]//a[not(@class="add-btn-small ")]`
-    config["Tags"] = tagConfig
+    tagConfig := make(mappedConfig)
+    tagConfig["Name"] = makeSimpleAttrConfig(`//div[@class="categoriesWrapper"]//a[not(@class="add-btn-small ")]`)
+    config.Tags = tagConfig

-    performerConfig := make(map[interface{}]interface{})
-    performerConfig["Name"] = `$performerElem/@data-mxptext`
-    performerConfig["URL"] = `$performerElem/@href`
-    config["Performers"] = performerConfig
+    performerConfig := make(mappedConfig)
+    performerConfig["Name"] = makeSimpleAttrConfig(`$performerElem/@data-mxptext`)
+    performerConfig["URL"] = makeSimpleAttrConfig(`$performerElem/@href`)
+    config.Performers = performerConfig

-    studioConfig := make(map[interface{}]interface{})
-    studioConfig["Name"] = `$studioElem`
-    studioConfig["URL"] = `$studioElem/@href`
-    config["Studio"] = studioConfig
+    studioConfig := make(mappedConfig)
+    studioConfig["Name"] = makeSimpleAttrConfig(`$studioElem`)
+    studioConfig["URL"] = makeSimpleAttrConfig(`$studioElem/@href`)
+    config.Studio = studioConfig

     const sep = " "
-    moviesNameConfig := make(map[interface{}]interface{})
-    moviesNameConfig["selector"] = `//i[@class="isMe tooltipTrig"]/@data-title`
-    moviesNameConfig["split"] = sep
-    moviesConfig := make(map[interface{}]interface{})
+    moviesNameConfig := mappedScraperAttrConfig{
+        Selector: `//i[@class="isMe tooltipTrig"]/@data-title`,
+        Split:    sep,
+    }
+    moviesConfig := make(mappedConfig)
     moviesConfig["Name"] = moviesNameConfig
-    config["Movies"] = moviesConfig
+    config.Movies = moviesConfig

-    scraper := xpathScraper{
-        Scene: config,
+    scraper := mappedScraper{
+        Scene:  &config,
         Common: common,
     }

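The field accesses in `makeSceneXPathConfig` imply a scene config struct shaped roughly as below. This is inferred from usage only, not the actual declaration:

```go
// Inferred shape — an assumption, not the real definition.
type mappedSceneScraperConfig struct {
    mappedConfig mappedConfig // simple attributes such as Title and Date
    Tags         mappedConfig
    Performers   mappedConfig
    Studio       mappedConfig
    Movies       mappedConfig
}
```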
@@ -764,7 +644,10 @@ func TestApplySceneXPathConfig(t *testing.T) {

     scraper := makeSceneXPathConfig()

-    scene, err := scraper.scrapeScene(doc)
+    q := &xpathQuery{
+        doc: doc,
+    }
+    scene, err := scraper.scrapeScene(q)

     if err != nil {
         t.Errorf("Error scraping scene: %s", err.Error())
@@ -831,21 +714,49 @@ xPathScrapers:
   performerScraper:
     performer:
       name: //h1[@itemprop="name"]
+  sceneScraper:
+    scene:
+      Title:
+        selector: //title
+        postProcess:
+          - parseDate: January 2, 2006
+      Tags:
+        Name: //tags
+      Movies:
+        Name: //movies
+      Performers:
+        Name: //performers
+      Studio:
+        Name: //studio
 `

-    config := &scraperConfig{}
-    err := yaml.Unmarshal([]byte(yamlStr), &config)
+    c := &config{}
+    err := yaml.Unmarshal([]byte(yamlStr), &c)

     if err != nil {
         t.Errorf("Error loading yaml: %s", err.Error())
         return
     }

+    // ensure fields are filled in correctly
+    sceneScraper := c.XPathScrapers["sceneScraper"]
+    sceneConfig := sceneScraper.Scene

+    assert.Equal(t, "//title", sceneConfig.mappedConfig["Title"].Selector)
+    assert.Equal(t, "//tags", sceneConfig.Tags["Name"].Selector)
+    assert.Equal(t, "//movies", sceneConfig.Movies["Name"].Selector)
+    assert.Equal(t, "//performers", sceneConfig.Performers["Name"].Selector)
+    assert.Equal(t, "//studio", sceneConfig.Studio["Name"].Selector)

+    postProcess := sceneConfig.mappedConfig["Title"].postProcessActions
+    parseDate := postProcess[0].(*postProcessParseDate)
+    assert.Equal(t, "January 2, 2006", string(*parseDate))
 }

 func TestLoadInvalidXPath(t *testing.T) {
-    config := make(xpathScraperConfig)
+    config := make(mappedConfig)

-    config["Name"] = `//a[id=']/span`
+    config["Name"] = makeSimpleAttrConfig(`//a[id=']/span`)

     reader := strings.NewReader(htmlDoc1)
     doc, err := htmlquery.Parse(reader)
@@ -855,6 +766,68 @@ func TestLoadInvalidXPath(t *testing.T) {
         return
     }

-    common := make(commonXPathConfig)
-    config.process(doc, common)
+    q := &xpathQuery{
+        doc: doc,
+    }
+
+    config.process(q, nil)
+}
+
+func TestSubScrape(t *testing.T) {
+    retHTML := `
+    <div>
+        <a href="/getName">A link</a>
+    </div>
+    `
+
+    ssHTML := `
+    <span>The name</span>
+    `
+
+    ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+        if r.URL.Path == "/getName" {
+            fmt.Fprint(w, ssHTML)
+        } else {
+            fmt.Fprint(w, retHTML)
+        }
+    }))
+    defer ts.Close()
+
+    yamlStr := `name: Test
+performerByURL:
+  - action: scrapeXPath
+    url:
+      - ` + ts.URL + `
+    scraper: performerScraper
+xPathScrapers:
+  performerScraper:
+    performer:
+      Name:
+        selector: //div/a/@href
+        postProcess:
+          - replace:
+              - regex: ^
+                with: ` + ts.URL + `
+          - subScraper:
+              selector: //span
+`
+
+    c := &config{}
+    err := yaml.Unmarshal([]byte(yamlStr), &c)
+
+    if err != nil {
+        t.Errorf("Error loading yaml: %s", err.Error())
+        return
+    }
+
+    globalConfig := GlobalConfig{}
+
+    performer, err := c.ScrapePerformerURL(ts.URL, globalConfig)
+
+    if err != nil {
+        t.Errorf("Error scraping performer: %s", err.Error())
+        return
+    }
+
+    verifyField(t, "The name", performer.Name, "Name")
 }
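The `subScraper` round trip this test drives — post-process the scraped value into a URL, fetch it, then run the nested selector on the response — reduces to something like the sketch below. The function name and details are ours, not the actual implementation:

```go
package scraper

import (
    "net/http"

    "github.com/antchfx/htmlquery"
)

// subScrapeValue treats the post-processed value as a URL, fetches it, and
// evaluates the nested selector against the returned document.
func subScrapeValue(url, selector string) (string, error) {
    resp, err := http.Get(url)
    if err != nil {
        return "", err
    }
    defer resp.Body.Close()

    doc, err := htmlquery.Parse(resp.Body)
    if err != nil {
        return "", err
    }

    node := htmlquery.FindOne(doc, selector)
    if node == nil {
        return "", nil // no match: the attribute stays empty
    }
    return htmlquery.InnerText(node), nil
}
```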
@@ -13,6 +13,7 @@ const markup = `
 * Add support for parent/child studios.

 ### 🎨 Improvements
+* Add mapped and fixed post-processing scraping options.
 * Add random sorting for performers.
 * Search for files which have lower or upper case supported filename extensions.
 * Add dialog when pasting movie images.
@@ -209,15 +209,26 @@ performer:

 This will set the `Name` attribute of the returned performer to the text content of the element that matches `<h1 itemprop="name">...`.

-The value may also be a sub-object, indicating that post-processing is required. If it is a sub-object, then the xpath must be set to the `selector` key of the sub-object. For example, using the same xpath as above:
+The value may also be a sub-object; in that case, the xpath must be set to the `selector` key of the sub-object. For example, using the same xpath as above:

 ```
 performer:
   Name:
     selector: //h1[@itemprop="name"]
+    postProcess:
       # post-processing config values
 ```

+#### Fixed attribute values
+
+Alternatively, an attribute value may be set to a fixed value, rather than scraping it from the webpage. This can be done by replacing `selector` with `fixed`. For example:
+
+```
+performer:
+  Gender:
+    fixed: Female
+```
+
 ##### Common fragments

 The `common` field is used to configure xpath fragments that can be referenced in the xpath strings. These are key-value pairs where the key is the string to reference the fragment, and the value is the string that the fragment will be replaced with. For example:
@@ -233,14 +244,44 @@ The `Measurements` xpath string will replace `$infoPiece` with `//div[@class="in

 ##### Post-processing options

-The following post-processing keys are available:
-* `concat`: if an xpath matches multiple elements, and `concat` is present, then all of the elements will be concatenated together
-* `replace`: contains an array of sub-objects. Each sub-object must have a `regex` and `with` field. The `regex` field is the regex pattern to replace, and `with` is the string to replace it with. `$` is used to reference capture groups - `$1` is the first capture group, `$2` the second and so on. Replacements are performed in order of the array.
+Post-processing operations are contained in the `postProcess` key. Post-processing operations are performed in the order they are specified. The following post-processing operations are available:
+* `map`: contains a map of input values to output values. Where a value matches one of the input values, it is replaced with the matching output value. If no value is matched, then the value is unmodified.
+
+Example:
+```
+performer:
+  Gender:
+    selector: //div[class="example element"]
+    postProcess:
+      - map:
+          F: Female
+          M: Male
+```
+Gets the contents of the selected div element, and sets the returned value to `Female` if the scraped value is `F`, or to `Male` if it is `M`.
+
+* `replace`: contains an array of sub-objects. Each sub-object must have a `regex` and `with` field. The `regex` field is the regex pattern to replace, and `with` is the string to replace it with. `$` is used to reference capture groups - `$1` is the first capture group, `$2` the second and so on. Replacements are performed in order of the array.
+
+Example:
+```
+CareerLength:
+  selector: $infoPiece[text() = 'Career Start and End:']/../span[@class="smallInfo"]
+  postProcess:
+    - replace:
+        - regex: \s+to\s+
+          with: "-"
+```
+Replaces `2001 to 2003` with `2001-2003`.
+
 * `subScraper`: if present, the sub-scraper will be executed after all other post-processes are complete and before parseDate. It then takes the value and performs an http request, using the value as the URL. Within the `subScraper` config is a nested scraping configuration. This allows you to traverse to other webpages to get the attribute value you are after. For more info and examples have a look at [#370](https://github.com/stashapp/stash/pull/370), [#606](https://github.com/stashapp/stash/pull/606)
 * `parseDate`: if present, the value is the date format using go's reference date (2006-01-02). For example, if an example date was `14-Mar-2003`, then the date format would be `02-Jan-2006`. See the [time.Parse documentation](https://golang.org/pkg/time/#Parse) for details. When present, the scraper will convert the input string into a date, then convert it to the string format used by stash (`YYYY-MM-DD`).
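To make the layout rule concrete, here is an editor's illustration using only the standard library — the layout string drives go's `time.Parse` exactly as described in the bullet above:

```go
package main

import (
    "fmt"
    "time"
)

func main() {
    // The layout "02-Jan-2006" describes input dates like "14-Mar-2003".
    t, err := time.Parse("02-Jan-2006", "14-Mar-2003")
    if err != nil {
        panic(err)
    }
    fmt.Println(t.Format("2006-01-02")) // prints 2003-03-14, the format stash stores
}
```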

+Additionally, there are a number of fixed post-processing fields that are specified at the attribute level (not in `postProcess`) and are performed after the `postProcess` operations:
+* `concat`: if an xpath matches multiple elements, and `concat` is present, then all of the elements will be concatenated together
 * `split`: it is the inverse of `concat` - splits a string into multiple elements using the given separator. For more info and examples have a look at PR [#579](https://github.com/stashapp/stash/pull/579)

-Post-processing is done in order of the fields above - `concat`, `regex`, `subscraper`, `parseDate` and then `split`.
+For backwards compatibility, `regex`, `subscraper` and `parseDate` are also allowed as keys for the attribute.
+
+Post-processing at the attribute level is done in the following order: `concat`, `regex`, `subscraper`, `parseDate` and then `split`.

 ##### Example

@@ -272,7 +313,8 @@ xPathScrapers:
     Measurements: $infoPiece[text() = 'Measurements:']/../span[@class="smallInfo"]
     Height:
       selector: $infoPiece[text() = 'Height:']/../span[@class="smallInfo"]
-      replace:
+      postProcess:
+        - replace:
           - regex: .*\((\d+) cm\)
             with: $1
     Ethnicity: $infoPiece[text() = 'Ethnicity:']/../span[@class="smallInfo"]

@@ -281,7 +323,8 @@ xPathScrapers:
     Tattoos: $infoPiece[text() = 'Tattoos:']/../span[@class="smallInfo"]
     CareerLength:
       selector: $infoPiece[text() = 'Career Start and End:']/../span[@class="smallInfo"]
-      replace:
+      postProcess:
+        - replace:
           - regex: \s+to\s+
             with: "-"
   sceneScraper: