From 7fdaccf669514423f2a94940b6e1a8965b67e689 Mon Sep 17 00:00:00 2001
From: WithoutPants <53250216+WithoutPants@users.noreply.github.com>
Date: Sun, 5 Jan 2020 03:39:33 +1100
Subject: [PATCH] Xpath scraping from URL (#285)
* Add xpath performer and scene scraping
* Add studio scraping
* Refactor code
* Fix compile error
* Don't overwrite performer URL during a scrape
---
go.mod | 3 +
go.sum | 5 +
pkg/scraper/config.go | 17 +-
pkg/scraper/xpath.go | 267 +
pkg/scraper/xpath_test.go | 732 +
.../performers/PerformerDetails/Performer.tsx | 5 +
.../github.com/antchfx/htmlquery/.gitignore | 32 +
.../github.com/antchfx/htmlquery/.travis.yml | 16 +
vendor/github.com/antchfx/htmlquery/LICENSE | 17 +
vendor/github.com/antchfx/htmlquery/README.md | 158 +
vendor/github.com/antchfx/htmlquery/cache.go | 40 +
vendor/github.com/antchfx/htmlquery/query.go | 338 +
vendor/github.com/antchfx/xpath/.gitignore | 32 +
vendor/github.com/antchfx/xpath/.travis.yml | 12 +
vendor/github.com/antchfx/xpath/LICENSE | 17 +
vendor/github.com/antchfx/xpath/README.md | 170 +
vendor/github.com/antchfx/xpath/build.go | 483 +
vendor/github.com/antchfx/xpath/func.go | 492 +
vendor/github.com/antchfx/xpath/func_go110.go | 9 +
.../antchfx/xpath/func_pre_go110.go | 15 +
vendor/github.com/antchfx/xpath/operator.go | 295 +
vendor/github.com/antchfx/xpath/parse.go | 1186 +
vendor/github.com/antchfx/xpath/query.go | 862 +
vendor/github.com/antchfx/xpath/xpath.go | 157 +
vendor/github.com/golang/groupcache/LICENSE | 191 +
.../github.com/golang/groupcache/lru/lru.go | 133 +
.../golang.org/x/net/html/charset/charset.go | 257 +
.../x/text/encoding/charmap/charmap.go | 249 +
.../x/text/encoding/charmap/maketables.go | 556 +
.../x/text/encoding/charmap/tables.go | 7410 +++
vendor/golang.org/x/text/encoding/encoding.go | 335 +
.../x/text/encoding/htmlindex/gen.go | 173 +
.../x/text/encoding/htmlindex/htmlindex.go | 86 +
.../x/text/encoding/htmlindex/map.go | 105 +
.../x/text/encoding/htmlindex/tables.go | 353 +
.../text/encoding/internal/identifier/gen.go | 142 +
.../internal/identifier/identifier.go | 81 +
.../text/encoding/internal/identifier/mib.go | 1619 +
.../x/text/encoding/internal/internal.go | 75 +
.../x/text/encoding/japanese/all.go | 12 +
.../x/text/encoding/japanese/eucjp.go | 225 +
.../x/text/encoding/japanese/iso2022jp.go | 299 +
.../x/text/encoding/japanese/maketables.go | 161 +
.../x/text/encoding/japanese/shiftjis.go | 189 +
.../x/text/encoding/japanese/tables.go | 26971 ++++++++++
.../x/text/encoding/korean/euckr.go | 177 +
.../x/text/encoding/korean/maketables.go | 143 +
.../x/text/encoding/korean/tables.go | 34152 ++++++++++++
.../x/text/encoding/simplifiedchinese/all.go | 12 +
.../x/text/encoding/simplifiedchinese/gbk.go | 269 +
.../encoding/simplifiedchinese/hzgb2312.go | 245 +
.../encoding/simplifiedchinese/maketables.go | 161 +
.../text/encoding/simplifiedchinese/tables.go | 43999 ++++++++++++++++
.../text/encoding/traditionalchinese/big5.go | 199 +
.../encoding/traditionalchinese/maketables.go | 140 +
.../encoding/traditionalchinese/tables.go | 37142 +++++++++++++
.../x/text/encoding/unicode/override.go | 82 +
.../x/text/encoding/unicode/unicode.go | 434 +
.../x/text/internal/language/common.go | 16 +
.../x/text/internal/language/compact.go | 29 +
.../text/internal/language/compact/compact.go | 61 +
.../x/text/internal/language/compact/gen.go | 64 +
.../internal/language/compact/gen_index.go | 113 +
.../internal/language/compact/gen_parents.go | 54 +
.../internal/language/compact/language.go | 260 +
.../text/internal/language/compact/parents.go | 120 +
.../text/internal/language/compact/tables.go | 1015 +
.../x/text/internal/language/compact/tags.go | 91 +
.../x/text/internal/language/compose.go | 167 +
.../x/text/internal/language/coverage.go | 28 +
.../x/text/internal/language/gen.go | 1520 +
.../x/text/internal/language/gen_common.go | 20 +
.../x/text/internal/language/language.go | 596 +
.../x/text/internal/language/lookup.go | 412 +
.../x/text/internal/language/match.go | 226 +
.../x/text/internal/language/parse.go | 594 +
.../x/text/internal/language/tables.go | 3431 ++
.../x/text/internal/language/tags.go | 48 +
vendor/golang.org/x/text/internal/tag/tag.go | 100 +
.../internal/utf8internal/utf8internal.go | 87 +
vendor/golang.org/x/text/language/coverage.go | 187 +
vendor/golang.org/x/text/language/doc.go | 102 +
vendor/golang.org/x/text/language/gen.go | 305 +
vendor/golang.org/x/text/language/go1_1.go | 38 +
vendor/golang.org/x/text/language/go1_2.go | 11 +
vendor/golang.org/x/text/language/language.go | 601 +
vendor/golang.org/x/text/language/match.go | 735 +
vendor/golang.org/x/text/language/parse.go | 228 +
vendor/golang.org/x/text/language/tables.go | 298 +
vendor/golang.org/x/text/language/tags.go | 145 +
vendor/golang.org/x/text/runes/cond.go | 187 +
vendor/golang.org/x/text/runes/runes.go | 355 +
vendor/modules.txt | 23 +
93 files changed, 174400 insertions(+), 4 deletions(-)
create mode 100644 pkg/scraper/xpath.go
create mode 100644 pkg/scraper/xpath_test.go
create mode 100644 vendor/github.com/antchfx/htmlquery/.gitignore
create mode 100644 vendor/github.com/antchfx/htmlquery/.travis.yml
create mode 100644 vendor/github.com/antchfx/htmlquery/LICENSE
create mode 100644 vendor/github.com/antchfx/htmlquery/README.md
create mode 100644 vendor/github.com/antchfx/htmlquery/cache.go
create mode 100644 vendor/github.com/antchfx/htmlquery/query.go
create mode 100644 vendor/github.com/antchfx/xpath/.gitignore
create mode 100644 vendor/github.com/antchfx/xpath/.travis.yml
create mode 100644 vendor/github.com/antchfx/xpath/LICENSE
create mode 100644 vendor/github.com/antchfx/xpath/README.md
create mode 100644 vendor/github.com/antchfx/xpath/build.go
create mode 100644 vendor/github.com/antchfx/xpath/func.go
create mode 100644 vendor/github.com/antchfx/xpath/func_go110.go
create mode 100644 vendor/github.com/antchfx/xpath/func_pre_go110.go
create mode 100644 vendor/github.com/antchfx/xpath/operator.go
create mode 100644 vendor/github.com/antchfx/xpath/parse.go
create mode 100644 vendor/github.com/antchfx/xpath/query.go
create mode 100644 vendor/github.com/antchfx/xpath/xpath.go
create mode 100644 vendor/github.com/golang/groupcache/LICENSE
create mode 100644 vendor/github.com/golang/groupcache/lru/lru.go
create mode 100644 vendor/golang.org/x/net/html/charset/charset.go
create mode 100644 vendor/golang.org/x/text/encoding/charmap/charmap.go
create mode 100644 vendor/golang.org/x/text/encoding/charmap/maketables.go
create mode 100644 vendor/golang.org/x/text/encoding/charmap/tables.go
create mode 100644 vendor/golang.org/x/text/encoding/encoding.go
create mode 100644 vendor/golang.org/x/text/encoding/htmlindex/gen.go
create mode 100644 vendor/golang.org/x/text/encoding/htmlindex/htmlindex.go
create mode 100644 vendor/golang.org/x/text/encoding/htmlindex/map.go
create mode 100644 vendor/golang.org/x/text/encoding/htmlindex/tables.go
create mode 100644 vendor/golang.org/x/text/encoding/internal/identifier/gen.go
create mode 100644 vendor/golang.org/x/text/encoding/internal/identifier/identifier.go
create mode 100644 vendor/golang.org/x/text/encoding/internal/identifier/mib.go
create mode 100644 vendor/golang.org/x/text/encoding/internal/internal.go
create mode 100644 vendor/golang.org/x/text/encoding/japanese/all.go
create mode 100644 vendor/golang.org/x/text/encoding/japanese/eucjp.go
create mode 100644 vendor/golang.org/x/text/encoding/japanese/iso2022jp.go
create mode 100644 vendor/golang.org/x/text/encoding/japanese/maketables.go
create mode 100644 vendor/golang.org/x/text/encoding/japanese/shiftjis.go
create mode 100644 vendor/golang.org/x/text/encoding/japanese/tables.go
create mode 100644 vendor/golang.org/x/text/encoding/korean/euckr.go
create mode 100644 vendor/golang.org/x/text/encoding/korean/maketables.go
create mode 100644 vendor/golang.org/x/text/encoding/korean/tables.go
create mode 100644 vendor/golang.org/x/text/encoding/simplifiedchinese/all.go
create mode 100644 vendor/golang.org/x/text/encoding/simplifiedchinese/gbk.go
create mode 100644 vendor/golang.org/x/text/encoding/simplifiedchinese/hzgb2312.go
create mode 100644 vendor/golang.org/x/text/encoding/simplifiedchinese/maketables.go
create mode 100644 vendor/golang.org/x/text/encoding/simplifiedchinese/tables.go
create mode 100644 vendor/golang.org/x/text/encoding/traditionalchinese/big5.go
create mode 100644 vendor/golang.org/x/text/encoding/traditionalchinese/maketables.go
create mode 100644 vendor/golang.org/x/text/encoding/traditionalchinese/tables.go
create mode 100644 vendor/golang.org/x/text/encoding/unicode/override.go
create mode 100644 vendor/golang.org/x/text/encoding/unicode/unicode.go
create mode 100644 vendor/golang.org/x/text/internal/language/common.go
create mode 100644 vendor/golang.org/x/text/internal/language/compact.go
create mode 100644 vendor/golang.org/x/text/internal/language/compact/compact.go
create mode 100644 vendor/golang.org/x/text/internal/language/compact/gen.go
create mode 100644 vendor/golang.org/x/text/internal/language/compact/gen_index.go
create mode 100644 vendor/golang.org/x/text/internal/language/compact/gen_parents.go
create mode 100644 vendor/golang.org/x/text/internal/language/compact/language.go
create mode 100644 vendor/golang.org/x/text/internal/language/compact/parents.go
create mode 100644 vendor/golang.org/x/text/internal/language/compact/tables.go
create mode 100644 vendor/golang.org/x/text/internal/language/compact/tags.go
create mode 100644 vendor/golang.org/x/text/internal/language/compose.go
create mode 100644 vendor/golang.org/x/text/internal/language/coverage.go
create mode 100644 vendor/golang.org/x/text/internal/language/gen.go
create mode 100644 vendor/golang.org/x/text/internal/language/gen_common.go
create mode 100644 vendor/golang.org/x/text/internal/language/language.go
create mode 100644 vendor/golang.org/x/text/internal/language/lookup.go
create mode 100644 vendor/golang.org/x/text/internal/language/match.go
create mode 100644 vendor/golang.org/x/text/internal/language/parse.go
create mode 100644 vendor/golang.org/x/text/internal/language/tables.go
create mode 100644 vendor/golang.org/x/text/internal/language/tags.go
create mode 100644 vendor/golang.org/x/text/internal/tag/tag.go
create mode 100644 vendor/golang.org/x/text/internal/utf8internal/utf8internal.go
create mode 100644 vendor/golang.org/x/text/language/coverage.go
create mode 100644 vendor/golang.org/x/text/language/doc.go
create mode 100644 vendor/golang.org/x/text/language/gen.go
create mode 100644 vendor/golang.org/x/text/language/go1_1.go
create mode 100644 vendor/golang.org/x/text/language/go1_2.go
create mode 100644 vendor/golang.org/x/text/language/language.go
create mode 100644 vendor/golang.org/x/text/language/match.go
create mode 100644 vendor/golang.org/x/text/language/parse.go
create mode 100644 vendor/golang.org/x/text/language/tables.go
create mode 100644 vendor/golang.org/x/text/language/tags.go
create mode 100644 vendor/golang.org/x/text/runes/cond.go
create mode 100644 vendor/golang.org/x/text/runes/runes.go
diff --git a/go.mod b/go.mod
index ae6b54f54..ecbeb8fba 100644
--- a/go.mod
+++ b/go.mod
@@ -3,6 +3,8 @@ module github.com/stashapp/stash
require (
github.com/99designs/gqlgen v0.9.0
github.com/PuerkitoBio/goquery v1.5.0
+ github.com/antchfx/htmlquery v1.2.0
+ github.com/antchfx/xpath v1.1.2 // indirect
github.com/bmatcuk/doublestar v1.1.5
github.com/disintegration/imaging v1.6.0
github.com/go-chi/chi v4.0.2+incompatible
@@ -21,6 +23,7 @@ require (
github.com/vektah/gqlparser v1.1.2
golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4
golang.org/x/image v0.0.0-20190118043309-183bebdce1b2 // indirect
+ golang.org/x/net v0.0.0-20190522155817-f3200d17e092
gopkg.in/yaml.v2 v2.2.2
)
diff --git a/go.sum b/go.sum
index 952018211..12943ceed 100644
--- a/go.sum
+++ b/go.sum
@@ -30,6 +30,10 @@ github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883/go.mod h1:rCTlJbsFo
github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o=
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239/go.mod h1:2FmKhYUyUczH0OGQWaF5ceTx0UBShxjsH6f8oGKYe2c=
+github.com/antchfx/htmlquery v1.2.0 h1:oKShnsGlnOHX6t4uj5OHgLKkABcJoqnXpqnscoi9Lpw=
+github.com/antchfx/htmlquery v1.2.0/go.mod h1:MS9yksVSQXls00iXkiMqXr0J+umL/AmxXKuP28SUJM8=
+github.com/antchfx/xpath v1.1.2 h1:YziPrtM0gEJBnhdUGxYcIVYXZ8FXbtbovxOi+UW/yWQ=
+github.com/antchfx/xpath v1.1.2/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
github.com/apache/thrift v0.0.0-20180902110319-2566ecd5d999/go.mod h1:cp2SuWMxlEZw2r+iP2GNCdIi4C1qmUzdZFSVb+bacwQ=
github.com/apache/thrift v0.12.0/go.mod h1:cp2SuWMxlEZw2r+iP2GNCdIi4C1qmUzdZFSVb+bacwQ=
github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8=
@@ -318,6 +322,7 @@ github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zV
github.com/golang-migrate/migrate/v4 v4.3.1 h1:3eR1NY+pplX+m6yJ1fQf5dFWX3fBgUtZfDiaS/kJVu4=
github.com/golang-migrate/migrate/v4 v4.3.1/go.mod h1:mJ89KBgbXmM3P49BqOxRL3riNF/ATlg5kMhm17GA0dE=
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
+github.com/golang/groupcache v0.0.0-20190129154638-5b532d6fd5ef h1:veQD95Isof8w9/WXiA+pa3tz3fJXkt5B7QaRBrM62gk=
github.com/golang/groupcache v0.0.0-20190129154638-5b532d6fd5ef/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/golang/lint v0.0.0-20180702182130-06c8688daad7/go.mod h1:tluoj9z5200jBnyusfRPU2LqT6J+DAorxEvtC7LHB+E=
github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
diff --git a/pkg/scraper/config.go b/pkg/scraper/config.go
index b073854ec..ee672949b 100644
--- a/pkg/scraper/config.go
+++ b/pkg/scraper/config.go
@@ -19,24 +19,27 @@ type scraperAction string
const (
scraperActionScript scraperAction = "script"
scraperActionStash scraperAction = "stash"
+ scraperActionXPath scraperAction = "scrapeXPath"
)
var allScraperAction = []scraperAction{
scraperActionScript,
scraperActionStash,
+ scraperActionXPath,
}
func (e scraperAction) IsValid() bool {
switch e {
- case scraperActionScript:
+ case scraperActionScript, scraperActionStash, scraperActionXPath:
return true
}
return false
}
type scraperTypeConfig struct {
- Action scraperAction `yaml:"action"`
- Script []string `yaml:"script,flow"`
+ Action scraperAction `yaml:"action"`
+ Script []string `yaml:"script,flow"`
+ Scraper string `yaml:"scraper"`
scraperConfig *scraperConfig
}
@@ -96,6 +99,8 @@ type scrapePerformerByURLConfig struct {
func (c *scrapePerformerByURLConfig) resolveFn() {
if c.Action == scraperActionScript {
c.performScrape = scrapePerformerURLScript
+ } else if c.Action == scraperActionXPath {
+ c.performScrape = scrapePerformerURLXpath
}
}
@@ -124,6 +129,8 @@ type scrapeSceneByURLConfig struct {
func (c *scrapeSceneByURLConfig) resolveFn() {
if c.Action == scraperActionScript {
c.performScrape = scrapeSceneURLScript
+ } else if c.Action == scraperActionXPath {
+ c.performScrape = scrapeSceneURLXPath
}
}
@@ -135,7 +142,9 @@ type scraperConfig struct {
PerformerByURL []*scrapePerformerByURLConfig `yaml:"performerByURL"`
SceneByFragment *sceneByFragmentConfig `yaml:"sceneByFragment"`
SceneByURL []*scrapeSceneByURLConfig `yaml:"sceneByURL"`
- StashServer *stashServer `yaml:"stashServer"`
+
+ StashServer *stashServer `yaml:"stashServer"`
+ XPathScrapers xpathScrapers `yaml:"xPathScrapers"`
}
func loadScraperFromYAML(path string) (*scraperConfig, error) {
diff --git a/pkg/scraper/xpath.go b/pkg/scraper/xpath.go
new file mode 100644
index 000000000..ab4902b83
--- /dev/null
+++ b/pkg/scraper/xpath.go
@@ -0,0 +1,267 @@
+package scraper
+
+import (
+	"errors"
+	"reflect"
+	"regexp"
+	"sort"
+	"strings"
+
+	"github.com/antchfx/htmlquery"
+	"golang.org/x/net/html"
+
+	"github.com/stashapp/stash/pkg/logger"
+	"github.com/stashapp/stash/pkg/models"
+)
+
+// commonXPathConfig maps a placeholder string to the text that replaces it
+// wherever the placeholder occurs in an expression (see applyCommon).
+type commonXPathConfig map[string]string
+
+// applyCommon returns src with every occurrence of each key in c replaced by
+// that key's value. A nil or empty receiver returns src unchanged.
+//
+// Keys are applied longest first (ties broken lexicographically) instead of
+// in Go's randomised map order, so the result is deterministic and a key that
+// is a prefix of another key (e.g. "$row" and "$rowValue") cannot clobber the
+// longer one. Empty keys are ignored: strings.Replace would otherwise insert
+// the value between every character of src.
+func (c commonXPathConfig) applyCommon(src string) string {
+	keys := make([]string, 0, len(c))
+	for key := range c {
+		if key != "" {
+			keys = append(keys, key)
+		}
+	}
+
+	sort.Slice(keys, func(i, j int) bool {
+		if len(keys[i]) != len(keys[j]) {
+			return len(keys[i]) > len(keys[j])
+		}
+		return keys[i] < keys[j]
+	})
+
+	ret := src
+	for _, key := range keys {
+		ret = strings.Replace(ret, key, c[key], -1)
+	}
+
+	return ret
+}
+
+// xpathScraperConfig maps a key to its configuration value. process treats
+// string values as xpath expressions and currently skips every other value
+// type; xPathResult.apply later uses the key as a struct field name.
+type xpathScraperConfig map[string]interface{}
+
+// createXPathScraperConfig converts a map with untyped keys into an
+// xpathScraperConfig, keeping only the entries whose key is a string. The
+// result is never nil, even when src is nil or empty.
+func createXPathScraperConfig(src map[interface{}]interface{}) xpathScraperConfig {
+	converted := xpathScraperConfig{}
+
+	// ranging over a nil map is a no-op, so src needs no explicit nil check
+	for rawKey, value := range src {
+		if name, ok := rawKey.(string); ok {
+			converted[name] = value
+		}
+	}
+
+	return converted
+}
+
+// process evaluates every string-valued entry of s as an xpath expression
+// against doc and groups the matches by position: the i-th node matched for a
+// key is stored under that key in the i-th result. A result therefore only
+// contains the keys whose expression matched at least i+1 nodes.
+//
+// When common is non-nil its replacements are applied to each expression
+// before it is evaluated. Values that are not strings are skipped, as are
+// expressions that cannot be parsed (an error is logged for those). The
+// return value is nil when nothing matched.
+func (s xpathScraperConfig) process(doc *html.Node, common commonXPathConfig) []xPathResult {
+	var ret []xPathResult
+
+	for k, v := range s {
+		expr, isStr := v.(string)
+		if !isStr {
+			// TODO - handle map type
+			continue
+		}
+
+		// apply common
+		if common != nil {
+			expr = common.applyCommon(expr)
+		}
+
+		// QueryAll reports an unparsable expression as an error whereas Find
+		// panics; expressions are loaded from configuration, so a typo in one
+		// of them must not take the whole process down.
+		found, err := htmlquery.QueryAll(doc, expr)
+		if err != nil {
+			logger.Errorf("Error parsing xpath expression '%s' for %s: %s", expr, k, err.Error())
+			continue
+		}
+
+		for i, elem := range found {
+			if i >= len(ret) {
+				ret = append(ret, make(xPathResult))
+			}
+
+			ret[i][k] = elem
+		}
+	}
+
+	return ret
+}
+
+// xpathScrapers maps a scraper name to its xpath scraper definition.
+type xpathScrapers map[string]*xpathScraper
+
+// xpathScraper holds the xpath configuration of one named scraper, read from
+// the YAML keys given in the field tags.
+type xpathScraper struct {
+ // Common holds the placeholder replacements applied to expressions
+ // before they are evaluated.
+ Common commonXPathConfig `yaml:"common"`
+ // Scene holds the scene field expressions plus the optional Tags,
+ // Performers and Studio sub-configurations.
+ Scene xpathScraperConfig `yaml:"scene"`
+ // Performer holds the performer field expressions.
+ Performer xpathScraperConfig `yaml:"performer"`
+}
+
+// Keys of the scene configuration whose values are nested sub-configurations
+// rather than plain xpath expressions. GetSceneSimple excludes them and
+// getSceneSubMap looks them up.
+const (
+ XPathScraperConfigSceneTags = "Tags"
+ XPathScraperConfigScenePerformers = "Performers"
+ XPathScraperConfigSceneStudio = "Studio"
+)
+
+// GetSceneSimple returns a copy of the scene configuration without the Tags,
+// Performers and Studio entries, which are sub-configurations handled
+// separately. The returned map is never nil, even when no scene
+// configuration is present.
+func (s xpathScraper) GetSceneSimple() xpathScraperConfig {
+	simple := make(xpathScraperConfig)
+
+	// ranging over a nil Scene map simply yields no entries
+	for key, value := range s.Scene {
+		switch key {
+		case XPathScraperConfigSceneTags, XPathScraperConfigScenePerformers, XPathScraperConfigSceneStudio:
+			// exclude the complex sub-configs
+			continue
+		}
+
+		simple[key] = value
+	}
+
+	return simple
+}
+
+// getSceneSubMap returns the nested configuration stored under key in the
+// scene configuration, converted with createXPathScraperConfig. It returns
+// nil when the key is absent or its value is not a non-nil
+// map[interface{}]interface{}.
+func (s xpathScraper) getSceneSubMap(key string) xpathScraperConfig {
+	// indexing a nil Scene map yields a nil value, which fails the assertion
+	subConfig, isMap := s.Scene[key].(map[interface{}]interface{})
+	if !isMap || subConfig == nil {
+		return nil
+	}
+
+	return createXPathScraperConfig(subConfig)
+}
+
+// GetScenePerformers returns the sub-configuration stored under the scene's
+// Performers key, or nil when getSceneSubMap finds no usable map there.
+func (s xpathScraper) GetScenePerformers() xpathScraperConfig {
+ return s.getSceneSubMap(XPathScraperConfigScenePerformers)
+}
+
+// GetSceneTags returns the sub-configuration stored under the scene's Tags
+// key, or nil when getSceneSubMap finds no usable map there.
+func (s xpathScraper) GetSceneTags() xpathScraperConfig {
+ return s.getSceneSubMap(XPathScraperConfigSceneTags)
+}
+
+// GetSceneStudio returns the sub-configuration stored under the scene's
+// Studio key, or nil when getSceneSubMap finds no usable map there.
+func (s xpathScraper) GetSceneStudio() xpathScraperConfig {
+ return s.getSceneSubMap(XPathScraperConfigSceneStudio)
+}
+
+// scrapePerformer evaluates the performer configuration against doc and
+// returns the scraped performer. It returns nil, nil when the scraper has no
+// performer configuration, and an empty performer when nothing matched. The
+// returned error is always nil.
+func (s xpathScraper) scrapePerformer(doc *html.Node) (*models.ScrapedPerformer, error) {
+	if s.Performer == nil {
+		return nil, nil
+	}
+
+	performer := &models.ScrapedPerformer{}
+
+	// only the first match of each expression is used
+	if matches := s.Performer.process(doc, s.Common); len(matches) > 0 {
+		matches[0].apply(performer)
+	}
+
+	return performer, nil
+}
+
+// scrapeScene evaluates the scene configuration against doc and returns the
+// scraped scene. The simple scene fields are taken from the first match of
+// each expression. Performers and tags get one entry per match of their
+// sub-configuration, and the studio uses the first match of its
+// sub-configuration.
+//
+// The Performers, Tags and Studio sub-configurations are only evaluated when
+// at least one simple scene expression matched; otherwise an empty scene is
+// returned. The returned error is always nil.
+func (s xpathScraper) scrapeScene(doc *html.Node) (*models.ScrapedScene, error) {
+	simpleConfig := s.GetSceneSimple()
+	if simpleConfig == nil {
+		// GetSceneSimple builds its result with make, so this guard is not
+		// expected to trigger; it is kept so behaviour stays unchanged.
+		return nil, nil
+	}
+
+	performersConfig := s.GetScenePerformers()
+	tagsConfig := s.GetSceneTags()
+	studioConfig := s.GetSceneStudio()
+
+	scene := &models.ScrapedScene{}
+
+	sceneResults := simpleConfig.process(doc, s.Common)
+	if len(sceneResults) == 0 {
+		return scene, nil
+	}
+
+	sceneResults[0].apply(scene)
+
+	// now apply the performers and tags
+	if performersConfig != nil {
+		for _, result := range performersConfig.process(doc, s.Common) {
+			performer := &models.ScrapedScenePerformer{}
+			result.apply(performer)
+			scene.Performers = append(scene.Performers, performer)
+		}
+	}
+
+	if tagsConfig != nil {
+		for _, result := range tagsConfig.process(doc, s.Common) {
+			tag := &models.ScrapedSceneTag{}
+			result.apply(tag)
+			scene.Tags = append(scene.Tags, tag)
+		}
+	}
+
+	if studioConfig != nil {
+		if studioResults := studioConfig.process(doc, s.Common); len(studioResults) > 0 {
+			studio := &models.ScrapedSceneStudio{}
+			studioResults[0].apply(studio)
+			scene.Studio = studio
+		}
+	}
+
+	return scene, nil
+}
+
+// xPathResult maps a configuration key to the HTML node matched for it. apply
+// uses each key as the name of the destination struct field.
+type xPathResult map[string]*html.Node
+
+// xpathSpaceRunRegexp matches a run of one or more spaces. It is compiled
+// once at package level rather than for every field that apply processes.
+var xpathSpaceRunRegexp = regexp.MustCompile(" +")
+
+// apply copies the cleaned inner text of every node in r into the field of
+// dest whose name equals the node's key. The text is trimmed, has its
+// newlines removed and has runs of spaces collapsed to a single space.
+//
+// dest must be a non-nil pointer to a struct. Only settable fields of type
+// string or *string are written. A key without a matching field, or one whose
+// field cannot hold a string, is logged and skipped instead of panicking, so
+// a mistake in a configuration key cannot crash the caller.
+func (r xPathResult) apply(dest interface{}) {
+	destVal := reflect.ValueOf(dest)
+
+	// dest should be a pointer to a struct; Elem and FieldByName panic on
+	// anything else
+	if destVal.Kind() != reflect.Ptr || destVal.IsNil() || destVal.Elem().Kind() != reflect.Struct {
+		logger.Errorf("Cannot apply xpath results to %T: not a pointer to a struct", dest)
+		return
+	}
+	destVal = destVal.Elem()
+
+	for key, v := range r {
+		field := destVal.FieldByName(key)
+		if !field.IsValid() {
+			logger.Errorf("Field %s does not exist in %T", key, dest)
+			continue
+		}
+
+		value := strings.TrimSpace(htmlquery.InnerText(v))
+
+		// remove multiple whitespace and end lines
+		value = strings.Replace(value, "\n", "", -1)
+		value = xpathSpaceRunRegexp.ReplaceAllString(value, " ")
+
+		var reflectValue reflect.Value
+		if field.Kind() == reflect.Ptr {
+			reflectValue = reflect.ValueOf(&value)
+		} else {
+			reflectValue = reflect.ValueOf(value)
+		}
+
+		// Set panics on unexported fields and on type mismatches, e.g. when a
+		// key names a slice or struct field
+		if !field.CanSet() || !reflectValue.Type().AssignableTo(field.Type()) {
+			logger.Errorf("Field %s in %T cannot be set from a string", key, dest)
+			continue
+		}
+
+		field.Set(reflectValue)
+	}
+}
+
+// scrapePerformerURLXpath looks up the xpath scraper named by c.Scraper,
+// loads the document at url and scrapes a performer from it. It returns an
+// error when no scraper with that name is configured or when the document
+// cannot be loaded.
+func scrapePerformerURLXpath(c scraperTypeConfig, url string) (*models.ScrapedPerformer, error) {
+	name := c.Scraper
+
+	// a missing map entry yields a nil pointer, the same as an explicit nil
+	xpathDef := c.scraperConfig.XPathScrapers[name]
+	if xpathDef == nil {
+		return nil, errors.New("xpath scraper with name " + name + " not found in config")
+	}
+
+	doc, loadErr := htmlquery.LoadURL(url)
+	if loadErr != nil {
+		return nil, loadErr
+	}
+
+	return xpathDef.scrapePerformer(doc)
+}
+
+// scrapeSceneURLXPath looks up the xpath scraper named by c.Scraper, loads
+// the document at url and scrapes a scene from it. It returns an error when
+// no scraper with that name is configured or when the document cannot be
+// loaded.
+func scrapeSceneURLXPath(c scraperTypeConfig, url string) (*models.ScrapedScene, error) {
+	name := c.Scraper
+
+	// a missing map entry yields a nil pointer, the same as an explicit nil
+	xpathDef := c.scraperConfig.XPathScrapers[name]
+	if xpathDef == nil {
+		return nil, errors.New("xpath scraper with name " + name + " not found in config")
+	}
+
+	doc, loadErr := htmlquery.LoadURL(url)
+	if loadErr != nil {
+		return nil, loadErr
+	}
+
+	return xpathDef.scrapeScene(doc)
+}
diff --git a/pkg/scraper/xpath_test.go b/pkg/scraper/xpath_test.go
new file mode 100644
index 000000000..45b6eca2b
--- /dev/null
+++ b/pkg/scraper/xpath_test.go
@@ -0,0 +1,732 @@
+package scraper
+
+import (
+ "strings"
+ "testing"
+
+ "github.com/antchfx/htmlquery"
+ "github.com/stashapp/stash/pkg/models"
+ "gopkg.in/yaml.v2"
+)
+
+// adapted from https://www.freeones.com/html/m_links/bio_Mia_Malkova.php
+const htmlDoc1 = `
+
+
+
+ Freeones: Mia Malkova Biography
+
+
+
+
+
+
+
+ |
+ Babe Name:
+ |
+
+ Mia Malkova
+ |
+
+
+ |
+ Profession:
+ |
+ Porn Star
+ |
+
+
+ |
+ Ethnicity:
+ |
+
+ Caucasian
+ |
+
+
+ |
+ Country of Origin:
+ |
+
+
+
+
+ United States
+
+ |
+
+
+ |
+ Date of Birth:
+ |
+
+ July 1, 1992 (27 years old)
+ |
+
+
+ |
+ Aliases:
+ |
+
+ Mia Bliss, Madison Clover, Madison Swan, Mia Mountain, Jessica
+ |
+
+
+ |
+ Eye Color:
+ |
+
+ Hazel
+ |
+
+
+ |
+ Hair Color:
+ |
+
+ Blonde
+ |
+
+
+ |
+ Height:
+ |
+
+
+ |
+
+
+ |
+ Measurements:
+ |
+
+ 34C-26-36
+ |
+
+
+ |
+ Fake boobs:
+ |
+
+ No
+ |
+
+
+ |
+ Career Start And End
+ |
+
+ 2012 - 2019
+ (7 Years In The Business)
+ |
+
+
+ |
+ Tattoos:
+ |
+
+ None
+ |
+
+
+ |
+ Piercings:
+ |
+
+ None
+ |
+
+
+ |
+ Social Network Links:
+ |
+
+
+ |
+
+
+
+
+
+
+
+`
+
+// makeCommonXPath builds the xpath expression selecting the "paramvalue" cell
+// of the biography table row whose "paramname" cell contains a <b> element
+// with exactly the text attr.
+func makeCommonXPath(attr string) string {
+	const (
+		beforeAttr = `//table[@id="biographyTable"]//tr/td[@class="paramname"]//b[text() = '`
+		afterAttr  = `']/ancestor::tr/td[@class="paramvalue"]`
+	)
+
+	return beforeAttr + attr + afterAttr
+}
+
+// makeXPathConfig returns the performer configuration used by the tests: one
+// expression per performer field, each built from the label shown in the
+// biography table of htmlDoc1.
+func makeXPathConfig() xpathScraperConfig {
+	return xpathScraperConfig{
+		// the name is wrapped in a link inside the value cell
+		"Name":         makeCommonXPath("Babe Name:") + `/a`,
+		"Ethnicity":    makeCommonXPath("Ethnicity:"),
+		"Country":      makeCommonXPath("Country of Origin:"),
+		"Birthdate":    makeCommonXPath("Date of Birth:"),
+		"Aliases":      makeCommonXPath("Aliases:"),
+		"EyeColor":     makeCommonXPath("Eye Color:"),
+		"Measurements": makeCommonXPath("Measurements:"),
+		"FakeTits":     makeCommonXPath("Fake boobs:"),
+		"Height":       makeCommonXPath("Height:"),
+		// no colon in attribute header
+		"CareerLength": makeCommonXPath("Career Start And End"),
+		"Tattoos":      makeCommonXPath("Tattoos:"),
+		"Piercings":    makeCommonXPath("Piercings:"),
+	}
+}
+
+// verifyField reports a test error when actual is nil or does not point to a
+// string equal to expected. field is only used to name the value in the
+// error message.
+func verifyField(t *testing.T, expected string, actual *string, field string) {
+	t.Helper()
+
+	switch {
+	case actual == nil:
+		t.Errorf("Expected %s to be set to %s, instead got nil", field, expected)
+	case *actual != expected:
+		t.Errorf("Expected %s to be set to %s, instead got %s", field, expected, *actual)
+	}
+}
+
+// TestScrapePerformerXPath parses htmlDoc1, scrapes a performer from it with
+// the configuration from makeXPathConfig and checks the scraped fields
+// against the values present in the document.
+func TestScrapePerformerXPath(t *testing.T) {
+	doc, err := htmlquery.Parse(strings.NewReader(htmlDoc1))
+	if err != nil {
+		t.Errorf("Error loading document: %s", err.Error())
+		return
+	}
+
+	scraper := xpathScraper{
+		Performer: makeXPathConfig(),
+	}
+
+	performer, err := scraper.scrapePerformer(doc)
+	if err != nil {
+		t.Errorf("Error scraping performer: %s", err.Error())
+		return
+	}
+
+	const (
+		performerName    = "Mia Malkova"
+		ethnicity        = "Caucasian"
+		country          = "United States"
+		birthdate        = "July 1, 1992 (27 years old)"
+		aliases          = "Mia Bliss, Madison Clover, Madison Swan, Mia Mountain, Jessica"
+		eyeColor         = "Hazel"
+		measurements     = "34C-26-36"
+		fakeTits         = "No"
+		careerLength     = "2012 - 2019"
+		tattoosPiercings = "None"
+	)
+
+	checks := []struct {
+		field    string
+		expected string
+		actual   *string
+	}{
+		{"Name", performerName, performer.Name},
+		{"Ethnicity", ethnicity, performer.Ethnicity},
+		{"Country", country, performer.Country},
+		{"Birthdate", birthdate, performer.Birthdate},
+		{"Aliases", aliases, performer.Aliases},
+		{"EyeColor", eyeColor, performer.EyeColor},
+		{"Measurements", measurements, performer.Measurements},
+		{"FakeTits", fakeTits, performer.FakeTits},
+
+		// TODO - this needs post-processing
+		//{"CareerLength", careerLength, performer.CareerLength},
+
+		{"Tattoos", tattoosPiercings, performer.Tattoos},
+		{"Piercings", tattoosPiercings, performer.Piercings},
+	}
+
+	for _, check := range checks {
+		verifyField(t, check.expected, check.actual, check.field)
+	}
+}
+
+const sceneHTML = `
+
+
+
+ Test Video - Pornhub.com
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Test Video
+
+
+
+
+
+
+
+
+ From:
+
+
+
+
+ - 87 videos
+
459466
+
+
+
+
+
+
+
+
+
+
+
+ Added on: 2 months ago
+
+
+
+ Featured on: 1 month ago
+
+
+
+
+
+
Jump to your favorite action
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+