Mirror of https://github.com/stashapp/stash.git (synced 2025-12-17 04:14:39 +03:00)
Add Xpath post processing and performer name query (#333)
* Extend xpath configuration. Support concatenation (attribute config shape sketched below)
* Add parseDate parsing option
* Add regex replacements
* Add xpath query performer by name
* Fix loading spinner on scrape performer
* Change ReplaceAll to Replace
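For orientation: the new attribute form of an xpath field accepts selector, concat, parseDate, and replace keys (see the xpathScraperAttrConfig accessors in the diff below). After YAML decoding, an attribute config reaches the scraper as a map[interface{}]interface{}; a hypothetical decoded birthdate attribute might look like this (the key names come from this commit, the selector and values are invented for illustration):

package main

import "fmt"

func main() {
    // Hypothetical decoded attribute config. The key names come from the
    // accessors in this commit; the selector and values are made up.
    birthdate := map[interface{}]interface{}{
        "selector":  `//span[@id="dob"]`,
        "parseDate": "January 2, 2006",
        "replace": []interface{}{
            map[interface{}]interface{}{
                "regex": `\s*\(.*\)\s*`,
                "with":  "",
            },
        },
    }
    fmt.Println(birthdate["selector"])
}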
@@ -2,9 +2,11 @@ package scraper

 import (
     "errors"
+    "net/url"
     "reflect"
     "regexp"
     "strings"
+    "time"

     "github.com/antchfx/htmlquery"
     "golang.org/x/net/html"
@@ -43,35 +45,209 @@ func createXPathScraperConfig(src map[interface{}]interface{}) xpathScraperConfig {
     return ret
 }

-func (s xpathScraperConfig) process(doc *html.Node, common commonXPathConfig) []xPathResult {
-    var ret []xPathResult
+type xpathRegexConfig map[interface{}]interface{}
+type xpathRegexConfigs []xpathRegexConfig

-    for k, v := range s {
-        asStr, isStr := v.(string)
+func (c xpathRegexConfig) apply(value string) string {
+    regex := ""
+    with := ""

-        if isStr {
-            // apply common
-            if common != nil {
-                asStr = common.applyCommon(asStr)
-            }
+    if regexI, _ := c["regex"]; regexI != nil {
+        regex, _ = regexI.(string)
+    }
+    if withI, _ := c["with"]; withI != nil {
+        with, _ = withI.(string)
+    }

-            found, err := htmlquery.QueryAll(doc, asStr)
-            if err != nil {
-                logger.Warnf("Error parsing xpath expression '%s': %s", asStr, err.Error())
-                continue
-            }
+    if regex != "" {
+        re, err := regexp.Compile(regex)
+        if err != nil {
+            logger.Warnf("Error compiling regex '%s': %s", regex, err.Error())
+            return value
+        }
+
+        return re.ReplaceAllString(value, with)
+    }
+
+    return value
+}
+
+func (c xpathRegexConfigs) apply(value string) string {
+    // apply regex in order
+    for _, config := range c {
+        value = config.apply(value)
+    }
+
+    // remove whitespace again
+    value = commonPostProcess(value)
+
+    return value
+}
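Each replace entry is applied in sequence, so later patterns see the output of earlier ones, and an entry whose regex fails to compile is skipped with a warning. A standalone sketch of that chaining (patterns and input invented for illustration):

package main

import (
    "fmt"
    "regexp"
)

func main() {
    value := "Height: 170 cm"
    // mirrors xpathRegexConfigs.apply: run each {regex, with} pair in order
    for _, r := range [][2]string{
        {`^Height:\s*`, ""}, // strip the label first
        {`\s*cm$`, ""},      // then drop the unit
    } {
        re, err := regexp.Compile(r[0])
        if err != nil {
            continue // the real code logs a warning and keeps the value as-is
        }
        value = re.ReplaceAllString(value, r[1])
    }
    fmt.Println(value) // 170
}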
+
+type xpathScraperAttrConfig map[interface{}]interface{}
+
+func (c xpathScraperAttrConfig) getString(key string) string {
+    ret, _ := c[key]
+
+    if ret == nil {
+        return ""
+    }
+
+    asStr, _ := ret.(string)
+    return asStr
+}
+
+func (c xpathScraperAttrConfig) getSelector() string {
+    const selectorKey = "selector"
+    return c.getString(selectorKey)
+}
+
+func (c xpathScraperAttrConfig) getConcat() string {
+    const concatKey = "concat"
+    return c.getString(concatKey)
+}
+
+func (c xpathScraperAttrConfig) hasConcat() bool {
+    return c.getConcat() != ""
+}
+
+func (c xpathScraperAttrConfig) getParseDate() string {
+    const parseDateKey = "parseDate"
+    return c.getString(parseDateKey)
+}
+
+func (c xpathScraperAttrConfig) getReplace() xpathRegexConfigs {
+    const replaceKey = "replace"
+    val, _ := c[replaceKey]
+
+    var ret xpathRegexConfigs
+    if val == nil {
+        return ret
+    }
+
+    asSlice, _ := val.([]interface{})
+
+    for _, v := range asSlice {
+        asMap, _ := v.(map[interface{}]interface{})
+        ret = append(ret, xpathRegexConfig(asMap))
+    }
+
+    return ret
+}
+
+func (c xpathScraperAttrConfig) concatenateResults(nodes []*html.Node) string {
+    separator := c.getConcat()
+    result := []string{}
+
+    for _, elem := range nodes {
+        text := htmlquery.InnerText(elem)
+        text = commonPostProcess(text)
+
+        result = append(result, text)
+    }
+
+    return strings.Join(result, separator)
+}
+
+func (c xpathScraperAttrConfig) parseDate(value string) string {
+    parseDate := c.getParseDate()
+
+    if parseDate == "" {
+        return value
+    }
+
+    // try to parse the date using the pattern
+    // if it fails, then just fall back to the original value
+    parsedValue, err := time.Parse(parseDate, value)
+    if err != nil {
+        logger.Warnf("Error parsing date string '%s' using format '%s': %s", value, parseDate, err.Error())
+        return value
+    }
+
+    // convert it into our date format
+    const internalDateFormat = "2006-01-02"
+    return parsedValue.Format(internalDateFormat)
+}
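The parseDate value is a Go time layout: it spells out how the reference date, Mon Jan 2 15:04:05 2006, would be written in the scraped text, and whatever matches is re-emitted in the internal 2006-01-02 format. For example:

package main

import (
    "fmt"
    "time"
)

func main() {
    // layout "January 2, 2006" says: full month name, day, comma, 4-digit year
    parsed, err := time.Parse("January 2, 2006", "March 14, 1988")
    if err != nil {
        // the real code logs a warning and falls back to the raw value
        fmt.Println("unparsed")
        return
    }
    fmt.Println(parsed.Format("2006-01-02")) // 1988-03-14
}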
+
+func (c xpathScraperAttrConfig) replaceRegex(value string) string {
+    replace := c.getReplace()
+    return replace.apply(value)
+}
+
+func (c xpathScraperAttrConfig) postProcess(value string) string {
+    // perform regex replacements first
+    value = c.replaceRegex(value)
+    value = c.parseDate(value)
+
+    return value
+}
+
+func commonPostProcess(value string) string {
+    value = strings.TrimSpace(value)
+
+    // remove multiple whitespace and end lines
+    re := regexp.MustCompile("\n")
+    value = re.ReplaceAllString(value, "")
+    re = regexp.MustCompile(" +")
+    value = re.ReplaceAllString(value, " ")
+
+    return value
+}
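Note that commonPostProcess deletes newlines outright rather than replacing them with a space, so it is the ordinary spaces around a line break that keep words separated after the collapse:

package main

import (
    "fmt"
    "regexp"
    "strings"
)

func main() {
    // same three steps as commonPostProcess
    v := strings.TrimSpace("  Jane \n  Doe  ")
    v = regexp.MustCompile("\n").ReplaceAllString(v, "")
    v = regexp.MustCompile(" +").ReplaceAllString(v, " ")
    fmt.Printf("%q\n", v) // "Jane Doe"
}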
+
+func runXPathQuery(doc *html.Node, xpath string, common commonXPathConfig) []*html.Node {
+    // apply common
+    if common != nil {
+        xpath = common.applyCommon(xpath)
+    }
+
+    found, err := htmlquery.QueryAll(doc, xpath)
+    if err != nil {
+        logger.Warnf("Error parsing xpath expression '%s': %s", xpath, err.Error())
+        return nil
+    }
+
+    return found
+}
+
+func (s xpathScraperConfig) process(doc *html.Node, common commonXPathConfig) xPathResults {
+    var ret xPathResults
+
+    for k, value := range s {
+        switch v := value.(type) {
+        case string:
+            found := runXPathQuery(doc, v, common)

             if len(found) > 0 {
                 for i, elem := range found {
-                    if i >= len(ret) {
-                        ret = append(ret, make(xPathResult))
-                    }
+                    text := htmlquery.InnerText(elem)
+                    text = commonPostProcess(text)

-                    ret[i][k] = elem
+                    ret = ret.setKey(i, k, text)
                 }
             }
+        case map[interface{}]interface{}:
+            attrConfig := xpathScraperAttrConfig(v)
+
+            found := runXPathQuery(doc, attrConfig.getSelector(), common)
+
+            if len(found) > 0 {
+                // check if we're concatenating the results into a single result
+                if attrConfig.hasConcat() {
+                    result := attrConfig.concatenateResults(found)
+                    result = attrConfig.postProcess(result)
+                    const i = 0
+                    ret = ret.setKey(i, k, result)
+                } else {
+                    for i, elem := range found {
+                        text := htmlquery.InnerText(elem)
+                        text = commonPostProcess(text)
+                        text = attrConfig.postProcess(text)
+
+                        ret = ret.setKey(i, k, text)
+                    }
+                }
+            }
         }
         // TODO - handle map type
     }

     return ret
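When concat is set, each matched node is cleaned up individually and then joined with the separator, and the regex/date post-processing runs once on the joined string; note the const i = 0, so a concatenated attribute always lands in the first result. A reduced sketch of the join (the strings stand in for htmlquery.InnerText output):

package main

import (
    "fmt"
    "strings"
)

func main() {
    // stand-ins for the post-processed InnerText of three matched nodes
    texts := []string{"one", "two", "three"}
    // mirrors concatenateResults with concat: ", "
    fmt.Println(strings.Join(texts, ", ")) // one, two, three
}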
@@ -153,6 +329,24 @@ func (s xpathScraper) scrapePerformer(doc *html.Node) (*models.ScrapedPerformer, error) {
     return &ret, nil
 }

+func (s xpathScraper) scrapePerformers(doc *html.Node) ([]*models.ScrapedPerformer, error) {
+    var ret []*models.ScrapedPerformer
+
+    performerMap := s.Performer
+    if performerMap == nil {
+        return nil, nil
+    }
+
+    results := performerMap.process(doc, s.Common)
+    for _, r := range results {
+        var p models.ScrapedPerformer
+        r.apply(&p)
+        ret = append(ret, &p)
+    }
+
+    return ret, nil
+}
+
 func (s xpathScraper) scrapeScene(doc *html.Node) (*models.ScrapedScene, error) {
     var ret models.ScrapedScene
@@ -204,7 +398,8 @@ func (s xpathScraper) scrapeScene(doc *html.Node) (*models.ScrapedScene, error) {
     return &ret, nil
 }

-type xPathResult map[string]*html.Node
+type xPathResult map[string]string
+type xPathResults []xPathResult

 func (r xPathResult) apply(dest interface{}) {
     destVal := reflect.ValueOf(dest)
@@ -212,22 +407,16 @@ func (r xPathResult) apply(dest interface{}) {
     // dest should be a pointer
     destVal = destVal.Elem()

-    for key, v := range r {
+    for key, value := range r {
         field := destVal.FieldByName(key)

         if field.IsValid() {
-            value := htmlquery.InnerText(v)
-            value = strings.TrimSpace(value)
-
-            // remove multiple whitespace and end lines
-            re := regexp.MustCompile("\n")
-            value = re.ReplaceAllString(value, "")
-            re = regexp.MustCompile(" +")
-            value = re.ReplaceAllString(value, " ")
-
             var reflectValue reflect.Value
             if field.Kind() == reflect.Ptr {
-                reflectValue = reflect.ValueOf(&value)
+                // need to copy the value, otherwise everything is set to the
+                // same pointer
+                localValue := value
+                reflectValue = reflect.ValueOf(&localValue)
             } else {
                 reflectValue = reflect.ValueOf(value)
             }
@@ -239,6 +428,15 @@ func (r xPathResult) apply(dest interface{}) {
     }
 }

+func (r xPathResults) setKey(index int, key string, value string) xPathResults {
+    if index >= len(r) {
+        r = append(r, make(xPathResult))
+    }
+
+    r[index][key] = value
+    return r
+}
+
 func scrapePerformerURLXpath(c scraperTypeConfig, url string) (*models.ScrapedPerformer, error) {
     scraper := c.scraperConfig.XPathScrapers[c.Scraper]
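setKey grows the result list on demand, so the value scraped from the i-th matched node lands in the i-th result row regardless of which key arrives first. A minimal standalone sketch of that behavior (types reduced to plain maps):

package main

import "fmt"

type result map[string]string
type results []result

// setKey mirrors xPathResults.setKey: append a new row when index is past
// the end, then record the key/value pair in that row.
func (r results) setKey(index int, key, value string) results {
    if index >= len(r) {
        r = append(r, make(result))
    }
    r[index][key] = value
    return r
}

func main() {
    var r results
    r = r.setKey(0, "Name", "Jane")
    r = r.setKey(1, "Name", "Joan")
    r = r.setKey(0, "URL", "https://example.org/jane")
    fmt.Println(r) // [map[Name:Jane URL:https://example.org/jane] map[Name:Joan]]
}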
@@ -270,3 +468,27 @@ func scrapeSceneURLXPath(c scraperTypeConfig, url string) (*models.ScrapedScene, error) {

     return scraper.scrapeScene(doc)
 }
+
+func scrapePerformerNamesXPath(c scraperTypeConfig, name string) ([]*models.ScrapedPerformer, error) {
+    scraper := c.scraperConfig.XPathScrapers[c.Scraper]
+
+    if scraper == nil {
+        return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
+    }
+
+    const placeholder = "{}"
+
+    // replace the placeholder string with the URL-escaped name
+    escapedName := url.QueryEscape(name)
+
+    u := c.QueryURL
+    u = strings.Replace(u, placeholder, escapedName, -1)
+
+    doc, err := htmlquery.LoadURL(u)
+
+    if err != nil {
+        return nil, err
+    }
+
+    return scraper.scrapePerformers(doc)
+}
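scrapePerformerNamesXPath builds the search URL by substituting the URL-escaped name for every {} placeholder in the scraper's configured queryURL. A sketch of just that substitution (the URL is a made-up example value):

package main

import (
    "fmt"
    "net/url"
    "strings"
)

func main() {
    queryURL := "https://example.org/search?q={}" // hypothetical configured queryURL
    name := "Jane Doe"
    // mirrors the placeholder handling above
    u := strings.Replace(queryURL, "{}", url.QueryEscape(name), -1)
    fmt.Println(u) // https://example.org/search?q=Jane+Doe
}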