Generic performer scrapers (#203)

* Generalise scraper API

* Add script performer scraper

* Fixes from testing

* Add context to scrapers and generalise

* Add scraping performer from URL

* Add error handling

* Move log to debug

* Add supported scrape types
This commit is contained in:
WithoutPants
2019-11-19 13:49:05 +11:00
committed by Leopere
parent 9bfa4e7560
commit 17247060b6
16 changed files with 836 additions and 132 deletions

View File

@@ -2,17 +2,38 @@ package scraper
import (
"fmt"
"github.com/PuerkitoBio/goquery"
"github.com/stashapp/stash/pkg/logger"
"github.com/stashapp/stash/pkg/models"
"net/http"
"net/url"
"regexp"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/stashapp/stash/pkg/logger"
"github.com/stashapp/stash/pkg/models"
)
func GetPerformerNames(q string) ([]string, error) {
const freeonesScraperID = "builtin_freeones"
const freeonesName = "Freeones"
var freeonesURLs = []string{
"freeones.com",
}
func GetFreeonesScraper() scraperConfig {
return scraperConfig{
ID: freeonesScraperID,
Name: "Freeones",
Type: models.ScraperTypePerformer,
Method: ScraperMethodBuiltin,
URLs: freeonesURLs,
scrapePerformerNamesFunc: GetPerformerNames,
scrapePerformerFunc: GetPerformer,
scrapePerformerURLFunc: GetPerformerURL,
}
}
func GetPerformerNames(c scraperConfig, q string) ([]*models.ScrapedPerformer, error) {
// Request the HTML page.
queryURL := "https://www.freeones.com/suggestions.php?q=" + url.PathEscape(q) + "&t=1"
res, err := http.Get(queryURL)
@@ -31,65 +52,42 @@ func GetPerformerNames(q string) ([]string, error) {
}
// Find the performers
var performerNames []string
var performers []*models.ScrapedPerformer
doc.Find(".suggestion").Each(func(i int, s *goquery.Selection) {
name := strings.Trim(s.Text(), " ")
performerNames = append(performerNames, name)
p := models.ScrapedPerformer{
Name: &name,
}
performers = append(performers, &p)
})
return performerNames, nil
return performers, nil
}
func GetPerformer(performerName string) (*models.ScrapedPerformer, error) {
queryURL := "https://www.freeones.com/search/?t=1&q=" + url.PathEscape(performerName) + "&view=thumbs"
res, err := http.Get(queryURL)
if err != nil {
return nil, err
}
defer res.Body.Close()
if res.StatusCode != 200 {
return nil, fmt.Errorf("status code error: %d %s", res.StatusCode, res.Status)
func GetPerformerURL(c scraperConfig, href string) (*models.ScrapedPerformer, error) {
// if we're already in the bio page, just scrape it
if regexp.MustCompile(`\/bio_.*\.php$`).MatchString(href) {
return getPerformerBio(c, href)
}
// Load the HTML document
doc, err := goquery.NewDocumentFromReader(res.Body)
if err != nil {
return nil, err
// otherwise try to get the bio page from the url
profileRE := regexp.MustCompile(`_links\/(.*?)\/$`)
if profileRE.MatchString(href) {
href = profileRE.ReplaceAllString(href, "_links/bio_$1.php")
return getPerformerBio(c, href)
}
performerLink := doc.Find("div.Block3 a").FilterFunction(func(i int, s *goquery.Selection) bool {
href, _ := s.Attr("href")
if href == "/html/j_links/Jenna_Leigh_c/" || href == "/html/a_links/Alexa_Grace_c/" {
return false
}
if strings.ToLower(s.Text()) == strings.ToLower(performerName) {
return true
}
alias := s.ParentsFiltered(".babeNameBlock").Find(".babeAlias").First();
if strings.Contains( strings.ToLower(alias.Text()), strings.ToLower(performerName) ) {
return true
}
return false
})
return nil, nil
}
href, _ := performerLink.Attr("href")
href = strings.TrimSuffix(href, "/")
regex := regexp.MustCompile(`.+_links\/(.+)`)
matches := regex.FindStringSubmatch(href)
if len(matches) < 2 {
return nil, fmt.Errorf("No matches found in %s",href)
}
href = strings.Replace(href, matches[1], "bio_"+matches[1]+".php", -1)
href = "https://www.freeones.com" + href
func getPerformerBio(c scraperConfig, href string) (*models.ScrapedPerformer, error) {
bioRes, err := http.Get(href)
if err != nil {
return nil, err
}
defer bioRes.Body.Close()
if res.StatusCode != 200 {
return nil, fmt.Errorf("status code error: %d %s", res.StatusCode, res.Status)
if bioRes.StatusCode != 200 {
return nil, fmt.Errorf("status code error: %d %s", bioRes.StatusCode, bioRes.Status)
}
// Load the HTML document
@@ -175,6 +173,57 @@ func GetPerformer(performerName string) (*models.ScrapedPerformer, error) {
return &result, nil
}
func GetPerformer(c scraperConfig, scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
if scrapedPerformer.Name == nil {
return nil, nil
}
performerName := *scrapedPerformer.Name
queryURL := "https://www.freeones.com/search/?t=1&q=" + url.PathEscape(performerName) + "&view=thumbs"
res, err := http.Get(queryURL)
if err != nil {
return nil, err
}
defer res.Body.Close()
if res.StatusCode != 200 {
return nil, fmt.Errorf("status code error: %d %s", res.StatusCode, res.Status)
}
// Load the HTML document
doc, err := goquery.NewDocumentFromReader(res.Body)
if err != nil {
return nil, err
}
performerLink := doc.Find("div.Block3 a").FilterFunction(func(i int, s *goquery.Selection) bool {
href, _ := s.Attr("href")
if href == "/html/j_links/Jenna_Leigh_c/" || href == "/html/a_links/Alexa_Grace_c/" {
return false
}
if strings.ToLower(s.Text()) == strings.ToLower(performerName) {
return true
}
alias := s.ParentsFiltered(".babeNameBlock").Find(".babeAlias").First()
if strings.Contains(strings.ToLower(alias.Text()), strings.ToLower(performerName)) {
return true
}
return false
})
href, _ := performerLink.Attr("href")
href = strings.TrimSuffix(href, "/")
regex := regexp.MustCompile(`.+_links\/(.+)`)
matches := regex.FindStringSubmatch(href)
if len(matches) < 2 {
return nil, fmt.Errorf("No matches found in %s", href)
}
href = strings.Replace(href, matches[1], "bio_"+matches[1]+".php", -1)
href = "https://www.freeones.com" + href
return getPerformerBio(c, href)
}
func getIndexes(doc *goquery.Document) map[string]int {
var indexes = make(map[string]int)
doc.Find(".paramname").Each(func(i int, s *goquery.Selection) {
@@ -236,7 +285,7 @@ func paramValue(params *goquery.Selection, paramIndex int) string {
return content
}
node = node.NextSibling
if (node == nil) {
if node == nil {
return ""
}
return trim(node.FirstChild.Data)