mirror of
https://github.com/stashapp/stash.git
synced 2025-12-18 04:44:37 +03:00
Add JSON scrape support (#717)
* Add support for scene fragment scrape in xpath
This commit is contained in:
@@ -8,17 +8,19 @@ const (
|
||||
scraperActionScript scraperAction = "script"
|
||||
scraperActionStash scraperAction = "stash"
|
||||
scraperActionXPath scraperAction = "scrapeXPath"
|
||||
scraperActionJson scraperAction = "scrapeJson"
|
||||
)
|
||||
|
||||
var allScraperAction = []scraperAction{
|
||||
scraperActionScript,
|
||||
scraperActionStash,
|
||||
scraperActionXPath,
|
||||
scraperActionJson,
|
||||
}
|
||||
|
||||
func (e scraperAction) IsValid() bool {
|
||||
switch e {
|
||||
case scraperActionScript, scraperActionStash, scraperActionXPath:
|
||||
case scraperActionScript, scraperActionStash, scraperActionXPath, scraperActionJson:
|
||||
return true
|
||||
}
|
||||
return false
|
||||
@@ -47,6 +49,8 @@ func getScraper(scraper scraperTypeConfig, config config, globalConfig GlobalCon
|
||||
return newStashScraper(scraper, config, globalConfig)
|
||||
case scraperActionXPath:
|
||||
return newXpathScraper(scraper, config, globalConfig)
|
||||
case scraperActionJson:
|
||||
return newJsonScraper(scraper, config, globalConfig)
|
||||
}
|
||||
|
||||
panic("unknown scraper action: " + scraper.Action)
|
||||
|
||||
@@ -44,6 +44,9 @@ type config struct {
|
||||
// Xpath scraping configurations
|
||||
XPathScrapers mappedScrapers `yaml:"xPathScrapers"`
|
||||
|
||||
// Json scraping configurations
|
||||
JsonScrapers mappedScrapers `yaml:"jsonScrapers"`
|
||||
|
||||
// Scraping driver options
|
||||
DriverOptions *scraperDriverOptions `yaml:"driver"`
|
||||
}
|
||||
|
||||
191
pkg/scraper/json.go
Normal file
191
pkg/scraper/json.go
Normal file
@@ -0,0 +1,191 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"io/ioutil"
|
||||
"net/url"
|
||||
"strings"
|
||||
|
||||
"github.com/stashapp/stash/pkg/logger"
|
||||
"github.com/stashapp/stash/pkg/models"
|
||||
"github.com/tidwall/gjson"
|
||||
)
|
||||
|
||||
type jsonScraper struct {
|
||||
scraper scraperTypeConfig
|
||||
config config
|
||||
globalConfig GlobalConfig
|
||||
}
|
||||
|
||||
func newJsonScraper(scraper scraperTypeConfig, config config, globalConfig GlobalConfig) *jsonScraper {
|
||||
return &jsonScraper{
|
||||
scraper: scraper,
|
||||
config: config,
|
||||
globalConfig: globalConfig,
|
||||
}
|
||||
}
|
||||
|
||||
func (s *jsonScraper) getJsonScraper() *mappedScraper {
|
||||
return s.config.JsonScrapers[s.scraper.Scraper]
|
||||
}
|
||||
|
||||
func (s *jsonScraper) scrapeURL(url string) (string, *mappedScraper, error) {
|
||||
scraper := s.getJsonScraper()
|
||||
|
||||
if scraper == nil {
|
||||
return "", nil, errors.New("json scraper with name " + s.scraper.Scraper + " not found in config")
|
||||
}
|
||||
|
||||
doc, err := s.loadURL(url)
|
||||
|
||||
if err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
|
||||
return doc, scraper, nil
|
||||
}
|
||||
|
||||
func (s *jsonScraper) loadURL(url string) (string, error) {
|
||||
r, err := loadURL(url, s.config, s.globalConfig)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
doc, err := ioutil.ReadAll(r)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
docStr := string(doc)
|
||||
if !gjson.Valid(docStr) {
|
||||
return "", errors.New("not valid json")
|
||||
}
|
||||
|
||||
if err == nil && s.config.DebugOptions != nil && s.config.DebugOptions.PrintHTML {
|
||||
logger.Infof("loadURL (%s) response: \n%s", url, docStr)
|
||||
}
|
||||
|
||||
return docStr, err
|
||||
}
|
||||
|
||||
func (s *jsonScraper) scrapePerformerByURL(url string) (*models.ScrapedPerformer, error) {
|
||||
doc, scraper, err := s.scrapeURL(url)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
q := s.getJsonQuery(doc)
|
||||
return scraper.scrapePerformer(q)
|
||||
}
|
||||
|
||||
func (s *jsonScraper) scrapeSceneByURL(url string) (*models.ScrapedScene, error) {
|
||||
doc, scraper, err := s.scrapeURL(url)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
q := s.getJsonQuery(doc)
|
||||
return scraper.scrapeScene(q)
|
||||
}
|
||||
|
||||
func (s *jsonScraper) scrapePerformersByName(name string) ([]*models.ScrapedPerformer, error) {
|
||||
scraper := s.getJsonScraper()
|
||||
|
||||
if scraper == nil {
|
||||
return nil, errors.New("json scraper with name " + s.scraper.Scraper + " not found in config")
|
||||
}
|
||||
|
||||
const placeholder = "{}"
|
||||
|
||||
// replace the placeholder string with the URL-escaped name
|
||||
escapedName := url.QueryEscape(name)
|
||||
|
||||
url := s.scraper.QueryURL
|
||||
url = strings.Replace(url, placeholder, escapedName, -1)
|
||||
|
||||
doc, err := s.loadURL(url)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
q := s.getJsonQuery(doc)
|
||||
return scraper.scrapePerformers(q)
|
||||
}
|
||||
|
||||
func (s *jsonScraper) scrapePerformerByFragment(scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
|
||||
return nil, errors.New("scrapePerformerByFragment not supported for json scraper")
|
||||
}
|
||||
|
||||
func (s *jsonScraper) scrapeSceneByFragment(scene models.SceneUpdateInput) (*models.ScrapedScene, error) {
|
||||
storedScene, err := sceneFromUpdateFragment(scene)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if storedScene == nil {
|
||||
return nil, errors.New("no scene found")
|
||||
}
|
||||
|
||||
// construct the URL
|
||||
url := constructSceneURL(s.scraper.QueryURL, storedScene)
|
||||
|
||||
scraper := s.getJsonScraper()
|
||||
|
||||
if scraper == nil {
|
||||
return nil, errors.New("json scraper with name " + s.scraper.Scraper + " not found in config")
|
||||
}
|
||||
|
||||
doc, err := s.loadURL(url)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
q := s.getJsonQuery(doc)
|
||||
return scraper.scrapeScene(q)
|
||||
}
|
||||
|
||||
func (s *jsonScraper) getJsonQuery(doc string) *jsonQuery {
|
||||
return &jsonQuery{
|
||||
doc: doc,
|
||||
scraper: s,
|
||||
}
|
||||
}
|
||||
|
||||
type jsonQuery struct {
|
||||
doc string
|
||||
scraper *jsonScraper
|
||||
}
|
||||
|
||||
func (q *jsonQuery) runQuery(selector string) []string {
|
||||
value := gjson.Get(q.doc, selector)
|
||||
|
||||
if !value.Exists() {
|
||||
logger.Warnf("Could not find json path '%s' in json object", selector)
|
||||
return nil
|
||||
}
|
||||
|
||||
var ret []string
|
||||
if value.IsArray() {
|
||||
value.ForEach(func(k, v gjson.Result) bool {
|
||||
ret = append(ret, v.String())
|
||||
return true
|
||||
})
|
||||
} else {
|
||||
ret = append(ret, value.String())
|
||||
}
|
||||
|
||||
return ret
|
||||
}
|
||||
|
||||
func (q *jsonQuery) subScrape(value string) mappedQuery {
|
||||
doc, err := q.scraper.loadURL(value)
|
||||
|
||||
if err != nil {
|
||||
logger.Warnf("Error getting URL '%s' for sub-scraper: %s", value, err.Error())
|
||||
return nil
|
||||
}
|
||||
|
||||
return q.scraper.getJsonQuery(doc)
|
||||
}
|
||||
93
pkg/scraper/json_test.go
Normal file
93
pkg/scraper/json_test.go
Normal file
@@ -0,0 +1,93 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"gopkg.in/yaml.v2"
|
||||
)
|
||||
|
||||
func TestJsonPerformerScraper(t *testing.T) {
|
||||
const yamlStr = `name: Test
|
||||
jsonScrapers:
|
||||
performerScraper:
|
||||
common:
|
||||
$extras: data.extras
|
||||
performer:
|
||||
Name: data.name
|
||||
Gender: $extras.gender
|
||||
Birthdate: $extras.birthday
|
||||
Ethnicity: $extras.ethnicity
|
||||
Height: $extras.height
|
||||
Measurements: $extras.measurements
|
||||
Tattoos: $extras.tattoos
|
||||
Piercings: $extras.piercings
|
||||
Aliases: data.aliases
|
||||
Image: data.image
|
||||
`
|
||||
|
||||
const json = `
|
||||
{
|
||||
"data": {
|
||||
"id": "2cd4146b-637d-49b1-8ff9-19d4a06947bb",
|
||||
"name": "Mia Malkova",
|
||||
"bio": "Some girls are so damn hot that they can get you bent out of shape, and you will not even be mad at them for doing so. Well, tawny blonde Mia Malkova can bend her body into any shape she pleases, and that’s sure to satisfy all of the horny cocks and wet pussies out there. This girl has acrobatic and contortionist abilities that could even twist a pretzel into a new knot, which can be very helpful in the ... arrow_drop_down Some girls are so damn hot that they can get you bent out of shape, and you will not even be mad at them for doing so. Well, tawny blonde Mia Malkova can bend her body into any shape she pleases, and that’s sure to satisfy all of the horny cocks and wet pussies out there. This girl has acrobatic and contortionist abilities that could even twist a pretzel into a new knot, which can be very helpful in the VR Porn movies – trust us. Ankles behind her neck and feet over her back so she can kiss her toes, turned, twisted and gyrating, she can fuck any which way she wants (and that ass!), will surely make you fall in love with this hot Virtual Reality Porn slut, as she is one of the finest of them all. Talking about perfection, maybe it’s all the acrobatic work that keeps it in such gorgeous shape? Who cares really, because you just want to take a big bite out of it and never let go. But it’s not all about the body. Mia’s also got a great smile, which might not sound kinky, but believe us, it is a smile that will heat up your innards and drop your pants. Is it her golden skin, her innocent pink lips or that heart-shaped face? There is just too much good stuff going on with Mia Malkova, which is maybe why these past few years have heaped awards upon awards on this Southern California native. Mia came to VR Bangers for her first VR Porn video, so you know she’s only going for top-notch scenes with top-game performers, men, and women. 
Better hit up that yoga studio if you ever dream of being able to bang a flexible and talented chick like lady Malkova. arrow_drop_up",
|
||||
"extras": {
|
||||
"gender": "Female",
|
||||
"birthday": "1992-07-01",
|
||||
"birthday_timestamp": 709948800,
|
||||
"birthplace": "Palm Springs, California, United States",
|
||||
"active": 1,
|
||||
"astrology": "Cancer (Jun 21 - Jul 22)",
|
||||
"ethnicity": "Caucasian",
|
||||
"nationality": "United States",
|
||||
"hair_colour": "Blonde",
|
||||
"weight": "126 lbs (or 57 kg)",
|
||||
"height": "5'6\" (or 167 cm)",
|
||||
"measurements": "34-26-36",
|
||||
"cupsize": "34C (75C)",
|
||||
"tattoos": "None",
|
||||
"piercings": "Navel",
|
||||
"first_seen": null
|
||||
},
|
||||
"aliases": [
|
||||
"Mia Bliss",
|
||||
"Madison Clover",
|
||||
"Madison Swan",
|
||||
"Mia Mountain",
|
||||
"Mia M.",
|
||||
"Mia Malvoka",
|
||||
"Mia Molkova",
|
||||
"Mia Thomas"
|
||||
],
|
||||
"image": "https:\/\/thumb.metadataapi.net\/unsafe\/1000x1500\/smart\/filters:sharpen():upscale()\/https%3A%2F%2Fcdn.metadataapi.net%2Fperformer%2F49%2F05%2F30%2Fade2255dc065032a89ebb23f0e038fa%2Fposter%2Fmia-malkova.jpg%3Fid1582610531"
|
||||
}
|
||||
}
|
||||
`
|
||||
|
||||
c := &config{}
|
||||
err := yaml.Unmarshal([]byte(yamlStr), &c)
|
||||
|
||||
if err != nil {
|
||||
t.Fatalf("Error loading yaml: %s", err.Error())
|
||||
}
|
||||
|
||||
// perform scrape using json string
|
||||
performerScraper := c.JsonScrapers["performerScraper"]
|
||||
|
||||
q := &jsonQuery{
|
||||
doc: json,
|
||||
}
|
||||
|
||||
scrapedPerformer, err := performerScraper.scrapePerformer(q)
|
||||
if err != nil {
|
||||
t.Fatalf("Error scraping performer: %s", err.Error())
|
||||
}
|
||||
|
||||
verifyField(t, "Mia Malkova", scrapedPerformer.Name, "Name")
|
||||
verifyField(t, "Female", scrapedPerformer.Gender, "Gender")
|
||||
verifyField(t, "1992-07-01", scrapedPerformer.Birthdate, "Birthdate")
|
||||
verifyField(t, "Caucasian", scrapedPerformer.Ethnicity, "Ethnicity")
|
||||
verifyField(t, "5'6\" (or 167 cm)", scrapedPerformer.Height, "Height")
|
||||
verifyField(t, "None", scrapedPerformer.Tattoos, "Tattoos")
|
||||
verifyField(t, "Navel", scrapedPerformer.Piercings, "Piercings")
|
||||
}
|
||||
@@ -191,3 +191,14 @@ func (s *stashScraper) scrapePerformerByURL(url string) (*models.ScrapedPerforme
|
||||
func (s *stashScraper) scrapeSceneByURL(url string) (*models.ScrapedScene, error) {
|
||||
return nil, errors.New("scrapeSceneByURL not supported for stash scraper")
|
||||
}
|
||||
|
||||
func sceneFromUpdateFragment(scene models.SceneUpdateInput) (*models.Scene, error) {
|
||||
qb := models.NewSceneQueryBuilder()
|
||||
id, err := strconv.Atoi(scene.ID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// TODO - should we modify it with the input?
|
||||
return qb.Find(id)
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@ import (
|
||||
"net/http"
|
||||
"net/http/cookiejar"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@@ -18,10 +19,25 @@ import (
|
||||
"github.com/chromedp/chromedp"
|
||||
jsoniter "github.com/json-iterator/go"
|
||||
"github.com/stashapp/stash/pkg/logger"
|
||||
"github.com/stashapp/stash/pkg/models"
|
||||
"golang.org/x/net/html/charset"
|
||||
"golang.org/x/net/publicsuffix"
|
||||
)
|
||||
|
||||
// scrapeGetTimeout is the timeout for the scrape http request, transfer time
// included. May want to make this configurable at some point.
const scrapeGetTimeout = 30 * time.Second
|
||||
|
||||
func constructSceneURL(url string, scene *models.Scene) string {
|
||||
// support checksum, title and filename
|
||||
ret := strings.Replace(url, "{checksum}", scene.Checksum.String, -1)
|
||||
ret = strings.Replace(url, "{oshash}", scene.OSHash.String, -1)
|
||||
ret = strings.Replace(ret, "{filename}", filepath.Base(scene.Path), -1)
|
||||
ret = strings.Replace(ret, "{title}", scene.Title.String, -1)
|
||||
|
||||
return ret
|
||||
}
|
||||
|
||||
func loadURL(url string, scraperConfig config, globalConfig GlobalConfig) (io.Reader, error) {
|
||||
driverOptions := scraperConfig.DriverOptions
|
||||
if driverOptions != nil && driverOptions.UseCDP {
|
||||
|
||||
@@ -6,7 +6,6 @@ import (
|
||||
"net/url"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/antchfx/htmlquery"
|
||||
|
||||
@@ -16,10 +15,6 @@ import (
|
||||
"github.com/stashapp/stash/pkg/models"
|
||||
)
|
||||
|
||||
// scrapeGetTimeout is the timeout for the scrape http request, transfer time
// included. May want to make this configurable at some point.
const scrapeGetTimeout = 30 * time.Second
|
||||
|
||||
type xpathScraper struct {
|
||||
scraper scraperTypeConfig
|
||||
config config
|
||||
@@ -104,7 +99,32 @@ func (s *xpathScraper) scrapePerformerByFragment(scrapedPerformer models.Scraped
|
||||
}
|
||||
|
||||
func (s *xpathScraper) scrapeSceneByFragment(scene models.SceneUpdateInput) (*models.ScrapedScene, error) {
|
||||
return nil, errors.New("scrapeSceneByFragment not supported for xpath scraper")
|
||||
storedScene, err := sceneFromUpdateFragment(scene)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if storedScene == nil {
|
||||
return nil, errors.New("no scene found")
|
||||
}
|
||||
|
||||
// construct the URL
|
||||
url := constructSceneURL(s.scraper.QueryURL, storedScene)
|
||||
|
||||
scraper := s.getXpathScraper()
|
||||
|
||||
if scraper == nil {
|
||||
return nil, errors.New("xpath scraper with name " + s.scraper.Scraper + " not found in config")
|
||||
}
|
||||
|
||||
doc, err := s.loadURL(url)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
q := s.getXPathQuery(doc)
|
||||
return scraper.scrapeScene(q)
|
||||
}
|
||||
|
||||
func (s *xpathScraper) loadURL(url string) (*html.Node, error) {
|
||||
|
||||
Reference in New Issue
Block a user