Generic performer scrapers (#203)

* Generalise scraper API

* Add script performer scraper (see the config sketch below)

* Fixes from testing

* Add context to scrapers and generalise

* Add scraping performer from URL

* Add error handling

* Move log to debug

* Add supported scrape types
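
Script scrapers are configured with JSON files placed in the scrapers path. The scraper ID is taken from the config filename, and the remaining fields mirror the json tags on scraperConfig in pkg/scraper/scrapers.go. A minimal, self-contained sketch of decoding such a config (the scraper name, commands, and URL are hypothetical):

package main

import (
    "encoding/json"
    "fmt"
)

// mirrors the json-tagged fields of scraperConfig in pkg/scraper/scrapers.go
type scraperConfig struct {
    Name              string   `json:"name"`
    Type              string   `json:"type"`
    Method            string   `json:"method"`
    URLs              []string `json:"urls"`
    GetPerformerNames []string `json:"get_performer_names"`
    GetPerformer      []string `json:"get_performer"`
    GetPerformerURL   []string `json:"get_performer_url"`
}

func main() {
    // hypothetical contents of scrapers/example.json; the ID becomes
    // "example", taken from the filename rather than the file contents
    data := []byte(`{
        "name": "Example",
        "type": "PERFORMER",
        "method": "SCRIPT",
        "urls": ["example.com"],
        "get_performer_names": ["python3", "example.py", "names"],
        "get_performer": ["python3", "example.py", "performer"],
        "get_performer_url": ["python3", "example.py", "url"]
    }`)

    var c scraperConfig
    if err := json.Unmarshal(data, &c); err != nil {
        panic(err)
    }
    fmt.Printf("%+v\n", c)
}
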
Authored by WithoutPants on 2019-11-19 13:49:05 +11:00
Committed by Leopere
parent 9bfa4e7560
commit 17247060b6
16 changed files with 836 additions and 132 deletions


@@ -8,7 +8,6 @@ import (
"github.com/99designs/gqlgen/graphql"
"github.com/stashapp/stash/pkg/models"
"github.com/stashapp/stash/pkg/scraper"
)
type Resolver struct{}
@@ -161,14 +160,6 @@ func (r *queryResolver) SceneMarkerTags(ctx context.Context, scene_id string) ([
return result, nil
}
func (r *queryResolver) ScrapeFreeones(ctx context.Context, performer_name string) (*models.ScrapedPerformer, error) {
return scraper.GetPerformer(performer_name)
}
func (r *queryResolver) ScrapeFreeonesPerformerList(ctx context.Context, query string) ([]string, error) {
return scraper.GetPerformerNames(query)
}
// wasFieldIncluded returns true if the given field was included in the request.
// Slices are unmarshalled to empty slices even if the field was omitted. This
// method determines if it was omitted altogether.


@@ -0,0 +1,53 @@
package api
import (
"context"
"github.com/stashapp/stash/pkg/models"
"github.com/stashapp/stash/pkg/scraper"
)
// Deprecated: use ScrapePerformer with a scraper ID instead
func (r *queryResolver) ScrapeFreeones(ctx context.Context, performer_name string) (*models.ScrapedPerformer, error) {
scrapedPerformer := models.ScrapedPerformerInput{
Name: &performer_name,
}
return scraper.GetFreeonesScraper().ScrapePerformer(scrapedPerformer)
}
// Deprecated: use ScrapePerformerList with a scraper ID instead
func (r *queryResolver) ScrapeFreeonesPerformerList(ctx context.Context, query string) ([]string, error) {
scrapedPerformers, err := scraper.GetFreeonesScraper().ScrapePerformerNames(query)
if err != nil {
return nil, err
}
var ret []string
for _, v := range scrapedPerformers {
name := v.Name
ret = append(ret, *name)
}
return ret, nil
}
func (r *queryResolver) ListScrapers(ctx context.Context, scraperType models.ScraperType) ([]*models.Scraper, error) {
return scraper.ListScrapers(scraperType)
}
func (r *queryResolver) ScrapePerformerList(ctx context.Context, scraperID string, query string) ([]*models.ScrapedPerformer, error) {
if query == "" {
return nil, nil
}
return scraper.ScrapePerformerList(scraperID, query)
}
func (r *queryResolver) ScrapePerformer(ctx context.Context, scraperID string, scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
return scraper.ScrapePerformer(scraperID, scrapedPerformer)
}
func (r *queryResolver) ScrapePerformerURL(ctx context.Context, url string) (*models.ScrapedPerformer, error) {
return scraper.ScrapePerformerURL(url)
}
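
These resolvers are thin wrappers around the new scraper package. A minimal sketch of driving that package directly (the query string is hypothetical; the scraper ID is the built-in one registered in the freeones scraper):

package main

import (
    "fmt"

    "github.com/stashapp/stash/pkg/models"
    "github.com/stashapp/stash/pkg/scraper"
)

func main() {
    // list performer scrapers: the built-in freeones scraper plus any
    // JSON configs found in the scrapers path
    scrapers, err := scraper.ListScrapers(models.ScraperTypePerformer)
    if err != nil {
        panic(err)
    }
    for _, s := range scrapers {
        fmt.Println(s.ID, s.SupportedScrapes)
    }

    // query a scraper by ID, then scrape the first match in full
    performers, err := scraper.ScrapePerformerList("builtin_freeones", "jane doe")
    if err != nil || len(performers) == 0 {
        return
    }
    full, err := scraper.ScrapePerformer("builtin_freeones", models.ScrapedPerformerInput{
        Name: performers[0].Name,
    })
    if err == nil && full != nil && full.Name != nil {
        fmt.Println(*full.Name)
    }
}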


@@ -22,6 +22,8 @@ const Password = "password"
const Database = "database"
const ScrapersPath = "scrapers_path"
const MaxTranscodeSize = "max_transcode_size"
const MaxStreamingTranscodeSize = "max_streaming_transcode_size"
@@ -73,6 +75,20 @@ func GetDatabasePath() string {
return viper.GetString(Database)
}
func GetDefaultScrapersPath() string {
// default to the same directory as the config file
configFileUsed := viper.ConfigFileUsed()
configDir := filepath.Dir(configFileUsed)
fn := filepath.Join(configDir, "scrapers")
return fn
}
func GetScrapersPath() string {
return viper.GetString(ScrapersPath)
}
func GetHost() string {
return viper.GetString(Host)
}


@@ -71,6 +71,9 @@ func initConfig() {
// Set generated to the metadata path for backwards compat
viper.SetDefault(config.Generated, viper.GetString(config.Metadata))
// Set default scrapers path
viper.SetDefault(config.ScrapersPath, config.GetDefaultScrapersPath())
// Disabling config watching due to race condition issue
// See: https://github.com/spf13/viper/issues/174
// Changes to the config outside the system will require a restart


@@ -2,17 +2,38 @@ package scraper
import (
"fmt"
"github.com/PuerkitoBio/goquery"
"github.com/stashapp/stash/pkg/logger"
"github.com/stashapp/stash/pkg/models"
"net/http"
"net/url"
"regexp"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/stashapp/stash/pkg/logger"
"github.com/stashapp/stash/pkg/models"
)
func GetPerformerNames(q string) ([]string, error) {
const freeonesScraperID = "builtin_freeones"
const freeonesName = "Freeones"
var freeonesURLs = []string{
"freeones.com",
}
func GetFreeonesScraper() scraperConfig {
return scraperConfig{
ID: freeonesScraperID,
Name: "Freeones",
Type: models.ScraperTypePerformer,
Method: ScraperMethodBuiltin,
URLs: freeonesURLs,
scrapePerformerNamesFunc: GetPerformerNames,
scrapePerformerFunc: GetPerformer,
scrapePerformerURLFunc: GetPerformerURL,
}
}
func GetPerformerNames(c scraperConfig, q string) ([]*models.ScrapedPerformer, error) {
// Request the HTML page.
queryURL := "https://www.freeones.com/suggestions.php?q=" + url.PathEscape(q) + "&t=1"
res, err := http.Get(queryURL)
@@ -31,65 +52,42 @@ func GetPerformerNames(q string) ([]string, error) {
}
// Find the performers
var performerNames []string
var performers []*models.ScrapedPerformer
doc.Find(".suggestion").Each(func(i int, s *goquery.Selection) {
name := strings.Trim(s.Text(), " ")
performerNames = append(performerNames, name)
p := models.ScrapedPerformer{
Name: &name,
}
performers = append(performers, &p)
})
return performerNames, nil
return performers, nil
}
func GetPerformer(performerName string) (*models.ScrapedPerformer, error) {
queryURL := "https://www.freeones.com/search/?t=1&q=" + url.PathEscape(performerName) + "&view=thumbs"
res, err := http.Get(queryURL)
if err != nil {
return nil, err
}
defer res.Body.Close()
if res.StatusCode != 200 {
return nil, fmt.Errorf("status code error: %d %s", res.StatusCode, res.Status)
func GetPerformerURL(c scraperConfig, href string) (*models.ScrapedPerformer, error) {
// if we're already in the bio page, just scrape it
if regexp.MustCompile(`\/bio_.*\.php$`).MatchString(href) {
return getPerformerBio(c, href)
}
// Load the HTML document
doc, err := goquery.NewDocumentFromReader(res.Body)
if err != nil {
return nil, err
// otherwise try to get the bio page from the url
profileRE := regexp.MustCompile(`_links\/(.*?)\/$`)
if profileRE.MatchString(href) {
href = profileRE.ReplaceAllString(href, "_links/bio_$1.php")
return getPerformerBio(c, href)
}
performerLink := doc.Find("div.Block3 a").FilterFunction(func(i int, s *goquery.Selection) bool {
href, _ := s.Attr("href")
if href == "/html/j_links/Jenna_Leigh_c/" || href == "/html/a_links/Alexa_Grace_c/" {
return false
}
if strings.ToLower(s.Text()) == strings.ToLower(performerName) {
return true
}
alias := s.ParentsFiltered(".babeNameBlock").Find(".babeAlias").First();
if strings.Contains( strings.ToLower(alias.Text()), strings.ToLower(performerName) ) {
return true
}
return false
})
return nil, nil
}
href, _ := performerLink.Attr("href")
href = strings.TrimSuffix(href, "/")
regex := regexp.MustCompile(`.+_links\/(.+)`)
matches := regex.FindStringSubmatch(href)
if len(matches) < 2 {
return nil, fmt.Errorf("No matches found in %s",href)
}
href = strings.Replace(href, matches[1], "bio_"+matches[1]+".php", -1)
href = "https://www.freeones.com" + href
func getPerformerBio(c scraperConfig, href string) (*models.ScrapedPerformer, error) {
bioRes, err := http.Get(href)
if err != nil {
return nil, err
}
defer bioRes.Body.Close()
if res.StatusCode != 200 {
return nil, fmt.Errorf("status code error: %d %s", res.StatusCode, res.Status)
if bioRes.StatusCode != 200 {
return nil, fmt.Errorf("status code error: %d %s", bioRes.StatusCode, bioRes.Status)
}
// Load the HTML document
@@ -175,6 +173,57 @@ func GetPerformer(performerName string) (*models.ScrapedPerformer, error) {
return &result, nil
}
func GetPerformer(c scraperConfig, scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
if scrapedPerformer.Name == nil {
return nil, nil
}
performerName := *scrapedPerformer.Name
queryURL := "https://www.freeones.com/search/?t=1&q=" + url.PathEscape(performerName) + "&view=thumbs"
res, err := http.Get(queryURL)
if err != nil {
return nil, err
}
defer res.Body.Close()
if res.StatusCode != 200 {
return nil, fmt.Errorf("status code error: %d %s", res.StatusCode, res.Status)
}
// Load the HTML document
doc, err := goquery.NewDocumentFromReader(res.Body)
if err != nil {
return nil, err
}
performerLink := doc.Find("div.Block3 a").FilterFunction(func(i int, s *goquery.Selection) bool {
href, _ := s.Attr("href")
if href == "/html/j_links/Jenna_Leigh_c/" || href == "/html/a_links/Alexa_Grace_c/" {
return false
}
if strings.ToLower(s.Text()) == strings.ToLower(performerName) {
return true
}
alias := s.ParentsFiltered(".babeNameBlock").Find(".babeAlias").First()
if strings.Contains(strings.ToLower(alias.Text()), strings.ToLower(performerName)) {
return true
}
return false
})
href, _ := performerLink.Attr("href")
href = strings.TrimSuffix(href, "/")
regex := regexp.MustCompile(`.+_links\/(.+)`)
matches := regex.FindStringSubmatch(href)
if len(matches) < 2 {
return nil, fmt.Errorf("No matches found in %s", href)
}
href = strings.Replace(href, matches[1], "bio_"+matches[1]+".php", -1)
href = "https://www.freeones.com" + href
return getPerformerBio(c, href)
}
func getIndexes(doc *goquery.Document) map[string]int {
var indexes = make(map[string]int)
doc.Find(".paramname").Each(func(i int, s *goquery.Selection) {
@@ -236,7 +285,7 @@ func paramValue(params *goquery.Selection, paramIndex int) string {
return content
}
node = node.NextSibling
if (node == nil) {
if node == nil {
return ""
}
return trim(node.FirstChild.Data)
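
The bio-page rewrite in GetPerformerURL above is a single regex substitution from a profile URL to its bio URL. A standalone sketch of the same transformation (the performer path is hypothetical):

package main

import (
    "fmt"
    "regexp"
)

func main() {
    // the same pattern GetPerformerURL uses to map a profile page to its bio page
    profileRE := regexp.MustCompile(`_links\/(.*?)\/$`)
    href := "https://www.freeones.com/html/j_links/Jane_Doe_c/"
    fmt.Println(profileRE.ReplaceAllString(href, "_links/bio_$1.php"))
    // prints https://www.freeones.com/html/j_links/bio_Jane_Doe_c.php
}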

pkg/scraper/scrapers.go (new file, 318 lines)

@@ -0,0 +1,318 @@
package scraper
import (
"encoding/json"
"errors"
"io"
"io/ioutil"
"os"
"os/exec"
"path/filepath"
"strings"
"github.com/stashapp/stash/pkg/logger"
"github.com/stashapp/stash/pkg/manager/config"
"github.com/stashapp/stash/pkg/models"
)
type ScraperMethod string
const (
ScraperMethodScript ScraperMethod = "SCRIPT"
ScraperMethodBuiltin ScraperMethod = "BUILTIN"
)
var AllScraperMethod = []ScraperMethod{
ScraperMethodScript,
}
func (e ScraperMethod) IsValid() bool {
switch e {
case ScraperMethodScript:
return true
}
return false
}
type scraperConfig struct {
ID string `json:"id"`
Name string `json:"name"`
Type models.ScraperType `json:"type"`
Method ScraperMethod `json:"method"`
URLs []string `json:"urls"`
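// argv of the script command to run for each operation (script scrapers only)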
GetPerformerNames []string `json:"get_performer_names"`
GetPerformer []string `json:"get_performer"`
GetPerformerURL []string `json:"get_performer_url"`
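// resolved implementations: set directly for builtin scrapers, or by postDecode for script scrapers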
scrapePerformerNamesFunc func(c scraperConfig, name string) ([]*models.ScrapedPerformer, error)
scrapePerformerFunc func(c scraperConfig, scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error)
scrapePerformerURLFunc func(c scraperConfig, url string) (*models.ScrapedPerformer, error)
}
func (c scraperConfig) toScraper() *models.Scraper {
ret := models.Scraper{
ID: c.ID,
Name: c.Name,
Type: c.Type,
Urls: c.URLs,
}
// determine supported actions
if len(c.URLs) > 0 {
ret.SupportedScrapes = append(ret.SupportedScrapes, models.ScrapeTypeURL)
}
if c.scrapePerformerNamesFunc != nil && c.scrapePerformerFunc != nil {
ret.SupportedScrapes = append(ret.SupportedScrapes, models.ScrapeTypeQuery)
}
return &ret
}
func (c *scraperConfig) postDecode() {
if c.Method == ScraperMethodScript {
// only enable name/performer query scraping when both script commands are configured
if len(c.GetPerformer) > 0 && len(c.GetPerformerNames) > 0 {
c.scrapePerformerNamesFunc = scrapePerformerNamesScript
c.scrapePerformerFunc = scrapePerformerScript
}
c.scrapePerformerURLFunc = scrapePerformerURLScript
}
}
func (c scraperConfig) ScrapePerformerNames(name string) ([]*models.ScrapedPerformer, error) {
return c.scrapePerformerNamesFunc(c, name)
}
func (c scraperConfig) ScrapePerformer(scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
return c.scrapePerformerFunc(c, scrapedPerformer)
}
func (c scraperConfig) ScrapePerformerURL(url string) (*models.ScrapedPerformer, error) {
return c.scrapePerformerURLFunc(c, url)
}
func runScraperScript(command []string, inString string, out interface{}) error {
cmd := exec.Command(command[0], command[1:]...)
cmd.Dir = config.GetScrapersPath()
stdin, err := cmd.StdinPipe()
if err != nil {
return err
}
go func() {
defer stdin.Close()
io.WriteString(stdin, inString)
}()
stderr, err := cmd.StderrPipe()
if err != nil {
logger.Error("Scraper stderr not available: " + err.Error())
}
stdout, err := cmd.StdoutPipe()
if err != nil {
logger.Error("Scraper stdout not available: " + err.Error())
}
if err = cmd.Start(); err != nil {
return errors.New("Error running scraper script")
}
// TODO - add a timeout here
decodeErr := json.NewDecoder(stdout).Decode(out)
stderrData, _ := ioutil.ReadAll(stderr)
stderrString := string(stderrData)
err = cmd.Wait()
if err != nil {
// error message should be in the stderr stream
logger.Errorf("scraper error when running command <%s>: %s", strings.Join(cmd.Args, " "), stderrString)
return errors.New("Error running scraper script")
}
if decodeErr != nil {
logger.Errorf("error decoding performer from scraper data: %s", err.Error())
return errors.New("Error decoding performer from scraper script")
}
return nil
}
func scrapePerformerNamesScript(c scraperConfig, name string) ([]*models.ScrapedPerformer, error) {
// marshal rather than hand-building the JSON so quotes in the name are escaped
inString, err := json.Marshal(map[string]string{"name": name})
if err != nil {
return nil, err
}
var performers []models.ScrapedPerformer
err = runScraperScript(c.GetPerformerNames, string(inString), &performers)
// convert to pointers
var ret []*models.ScrapedPerformer
if err == nil {
for i := 0; i < len(performers); i++ {
ret = append(ret, &performers[i])
}
}
return ret, err
}
func scrapePerformerScript(c scraperConfig, scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
inString, err := json.Marshal(scrapedPerformer)
if err != nil {
return nil, err
}
var ret models.ScrapedPerformer
err = runScraperScript(c.GetPerformer, string(inString), &ret)
return &ret, err
}
func scrapePerformerURLScript(c scraperConfig, url string) (*models.ScrapedPerformer, error) {
// marshal rather than hand-building the JSON so characters in the url are escaped
inString, err := json.Marshal(map[string]string{"url": url})
if err != nil {
return nil, err
}
var ret models.ScrapedPerformer
err = runScraperScript(c.GetPerformerURL, string(inString), &ret)
return &ret, err
}
var scrapers []scraperConfig
func loadScraper(path string) (*scraperConfig, error) {
var scraper scraperConfig
file, err := os.Open(path)
if err != nil {
return nil, err
}
defer file.Close()
jsonParser := json.NewDecoder(file)
err = jsonParser.Decode(&scraper)
if err != nil {
return nil, err
}
// set id to the filename
id := filepath.Base(path)
id = id[:strings.LastIndex(id, ".")]
scraper.ID = id
scraper.postDecode()
return &scraper, nil
}
func loadScrapers() ([]scraperConfig, error) {
if scrapers != nil {
return scrapers, nil
}
path := config.GetScrapersPath()
scrapers = make([]scraperConfig, 0)
logger.Debugf("Reading scraper configs from %s", path)
scraperFiles, err := filepath.Glob(filepath.Join(path, "*.json"))
if err != nil {
logger.Errorf("Error reading scraper configs: %s", err.Error())
return nil, err
}
// add built-in freeones scraper
scrapers = append(scrapers, GetFreeonesScraper())
for _, file := range scraperFiles {
scraper, err := loadScraper(file)
if err != nil {
logger.Errorf("Error loading scraper %s: %s", file, err.Error())
} else {
scrapers = append(scrapers, *scraper)
}
}
return scrapers, nil
}
func ListScrapers(scraperType models.ScraperType) ([]*models.Scraper, error) {
// read scraper config files from the directory and cache
scrapers, err := loadScrapers()
if err != nil {
return nil, err
}
var ret []*models.Scraper
for _, s := range scrapers {
// filter on type
if s.Type == scraperType {
ret = append(ret, s.toScraper())
}
}
return ret, nil
}
func findPerformerScraper(scraperID string) *scraperConfig {
// read scraper config files from the directory and cache
loadScrapers()
for _, s := range scrapers {
if s.ID == scraperID {
return &s
}
}
return nil
}
func findPerformerScraperURL(url string) *scraperConfig {
// read scraper config files from the directory and cache
loadScrapers()
for _, s := range scrapers {
for _, thisURL := range s.URLs {
if strings.Contains(url, thisURL) {
return &s
}
}
}
return nil
}
func ScrapePerformerList(scraperID string, query string) ([]*models.ScrapedPerformer, error) {
// find scraper with the provided id
s := findPerformerScraper(scraperID)
if s != nil {
return s.ScrapePerformerNames(query)
}
return nil, errors.New("Scraper with ID " + scraperID + " not found")
}
func ScrapePerformer(scraperID string, scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
// find scraper with the provided id
s := findPerformerScraper(scraperID)
if s != nil {
return s.ScrapePerformer(scrapedPerformer)
}
return nil, errors.New("Scraper with ID " + scraperID + " not found")
}
func ScrapePerformerURL(url string) (*models.ScrapedPerformer, error) {
// find scraper that matches the url given
s := findPerformerScraperURL(url)
if s != nil {
return s.ScrapePerformerURL(url)
}
return nil, nil
}
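
runScraperScript gives script scrapers a simple contract: one JSON object on stdin, one JSON document on stdout, and stderr reserved for error text. A minimal conforming scraper sketched in Go (the output fields are hypothetical; a real scraper can be any executable):

package main

import (
    "encoding/json"
    "os"
)

func main() {
    // read the input payload, e.g. {"name": "..."} for a get_performer_names call
    var in struct {
        Name string `json:"name"`
    }
    if err := json.NewDecoder(os.Stdin).Decode(&in); err != nil {
        os.Stderr.WriteString("bad input: " + err.Error())
        os.Exit(1)
    }

    // a real scraper would look the name up somewhere; echo a stub result list,
    // which the caller decodes into []models.ScrapedPerformer
    out := []map[string]string{
        {"name": in.Name},
    }
    json.NewEncoder(os.Stdout).Encode(out)
}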