Mirror of https://github.com/stashapp/stash.git, synced 2025-12-18 04:44:37 +03:00
Generic performer scrapers (#203)
* Generalise scraper API
* Add script performer scraper
* Fixes from testing
* Add context to scrapers and generalise
* Add scraping performer from URL
* Add error handling
* Move log to debug
* Add supported scrape types
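The `scraperConfig` values wired up in this diff point at a struct defined elsewhere in the `scraper` package, so only its usage is visible here. As a rough sketch (field and type names are inferred from how `GetFreeonesScraper` populates the struct below, not copied from the actual definition), the generalised config looks something like this:

// Sketch only: inferred from usage in this diff, not the package's real
// scraperConfig definition.
type scraperConfig struct {
    ID     string
    Name   string
    Type   models.ScraperType // holds values such as models.ScraperTypePerformer
    Method ScraperMethod      // holds values such as ScraperMethodBuiltin
    URLs   []string           // URL fragments this scraper claims to handle

    // Per-operation hooks; a scraper supports whichever operations are non-nil.
    scrapePerformerNamesFunc func(scraperConfig, string) ([]*models.ScrapedPerformer, error)
    scrapePerformerFunc      func(scraperConfig, models.ScrapedPerformerInput) (*models.ScrapedPerformer, error)
    scrapePerformerURLFunc   func(scraperConfig, string) (*models.ScrapedPerformer, error)
}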
@@ -2,17 +2,38 @@ package scraper
 
 import (
     "fmt"
-    "github.com/PuerkitoBio/goquery"
-    "github.com/stashapp/stash/pkg/logger"
-    "github.com/stashapp/stash/pkg/models"
     "net/http"
     "net/url"
     "regexp"
     "strings"
     "time"
+
+    "github.com/PuerkitoBio/goquery"
+    "github.com/stashapp/stash/pkg/logger"
+    "github.com/stashapp/stash/pkg/models"
 )
 
-func GetPerformerNames(q string) ([]string, error) {
+const freeonesScraperID = "builtin_freeones"
+const freeonesName = "Freeones"
+
+var freeonesURLs = []string{
+    "freeones.com",
+}
+
+func GetFreeonesScraper() scraperConfig {
+    return scraperConfig{
+        ID:                       freeonesScraperID,
+        Name:                     "Freeones",
+        Type:                     models.ScraperTypePerformer,
+        Method:                   ScraperMethodBuiltin,
+        URLs:                     freeonesURLs,
+        scrapePerformerNamesFunc: GetPerformerNames,
+        scrapePerformerFunc:      GetPerformer,
+        scrapePerformerURLFunc:   GetPerformerURL,
+    }
+}
+
+func GetPerformerNames(c scraperConfig, q string) ([]*models.ScrapedPerformer, error) {
     // Request the HTML page.
     queryURL := "https://www.freeones.com/suggestions.php?q=" + url.PathEscape(q) + "&t=1"
     res, err := http.Get(queryURL)
@@ -31,65 +52,42 @@ func GetPerformerNames(q string) ([]string, error) {
     }
 
     // Find the performers
-    var performerNames []string
+    var performers []*models.ScrapedPerformer
     doc.Find(".suggestion").Each(func(i int, s *goquery.Selection) {
         name := strings.Trim(s.Text(), " ")
-        performerNames = append(performerNames, name)
+        p := models.ScrapedPerformer{
+            Name: &name,
+        }
+        performers = append(performers, &p)
     })
 
-    return performerNames, nil
+    return performers, nil
 }
 
-func GetPerformer(performerName string) (*models.ScrapedPerformer, error) {
-    queryURL := "https://www.freeones.com/search/?t=1&q=" + url.PathEscape(performerName) + "&view=thumbs"
-    res, err := http.Get(queryURL)
-    if err != nil {
-        return nil, err
-    }
-    defer res.Body.Close()
-    if res.StatusCode != 200 {
-        return nil, fmt.Errorf("status code error: %d %s", res.StatusCode, res.Status)
+func GetPerformerURL(c scraperConfig, href string) (*models.ScrapedPerformer, error) {
+    // if we're already in the bio page, just scrape it
+    if regexp.MustCompile(`\/bio_.*\.php$`).MatchString(href) {
+        return getPerformerBio(c, href)
     }
 
-    // Load the HTML document
-    doc, err := goquery.NewDocumentFromReader(res.Body)
-    if err != nil {
-        return nil, err
+    // otherwise try to get the bio page from the url
+    profileRE := regexp.MustCompile(`_links\/(.*?)\/$`)
+    if profileRE.MatchString(href) {
+        href = profileRE.ReplaceAllString(href, "_links/bio_$1.php")
+        return getPerformerBio(c, href)
     }
 
-    performerLink := doc.Find("div.Block3 a").FilterFunction(func(i int, s *goquery.Selection) bool {
-        href, _ := s.Attr("href")
-        if href == "/html/j_links/Jenna_Leigh_c/" || href == "/html/a_links/Alexa_Grace_c/" {
-            return false
-        }
-        if strings.ToLower(s.Text()) == strings.ToLower(performerName) {
-            return true
-        }
-        alias := s.ParentsFiltered(".babeNameBlock").Find(".babeAlias").First();
-        if strings.Contains( strings.ToLower(alias.Text()), strings.ToLower(performerName) ) {
-            return true
-        }
-        return false
-    })
+    return nil, nil
+}
 
-    href, _ := performerLink.Attr("href")
-    href = strings.TrimSuffix(href, "/")
-    regex := regexp.MustCompile(`.+_links\/(.+)`)
-    matches := regex.FindStringSubmatch(href)
-    if len(matches) < 2 {
-        return nil, fmt.Errorf("No matches found in %s",href)
-    }
-
-    href = strings.Replace(href, matches[1], "bio_"+matches[1]+".php", -1)
-    href = "https://www.freeones.com" + href
-
+func getPerformerBio(c scraperConfig, href string) (*models.ScrapedPerformer, error) {
     bioRes, err := http.Get(href)
     if err != nil {
         return nil, err
     }
     defer bioRes.Body.Close()
-    if res.StatusCode != 200 {
-        return nil, fmt.Errorf("status code error: %d %s", res.StatusCode, res.Status)
+    if bioRes.StatusCode != 200 {
+        return nil, fmt.Errorf("status code error: %d %s", bioRes.StatusCode, bioRes.Status)
     }
 
     // Load the HTML document
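The two regular expressions in the new GetPerformerURL cover the two shapes a Freeones performer link can take: a direct bio page (`.../bio_<name>.php`), which is scraped as-is, and a profile directory (`..._links/<name>/`), which is first rewritten to its bio URL. A self-contained illustration of that rewrite, using a made-up performer path rather than a real one:

package main

import (
    "fmt"
    "regexp"
)

func main() {
    // Hypothetical profile URL; the segment "Jane_Doe_c" is invented for illustration.
    href := "https://www.freeones.com/html/j_links/Jane_Doe_c/"

    // Same pattern and replacement as GetPerformerURL uses above.
    profileRE := regexp.MustCompile(`_links\/(.*?)\/$`)
    fmt.Println(profileRE.ReplaceAllString(href, "_links/bio_$1.php"))
    // Prints: https://www.freeones.com/html/j_links/bio_Jane_Doe_c.php
}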
@@ -175,6 +173,57 @@ func GetPerformer(performerName string) (*models.ScrapedPerformer, error) {
     return &result, nil
 }
 
+func GetPerformer(c scraperConfig, scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
+    if scrapedPerformer.Name == nil {
+        return nil, nil
+    }
+
+    performerName := *scrapedPerformer.Name
+    queryURL := "https://www.freeones.com/search/?t=1&q=" + url.PathEscape(performerName) + "&view=thumbs"
+    res, err := http.Get(queryURL)
+    if err != nil {
+        return nil, err
+    }
+    defer res.Body.Close()
+    if res.StatusCode != 200 {
+        return nil, fmt.Errorf("status code error: %d %s", res.StatusCode, res.Status)
+    }
+
+    // Load the HTML document
+    doc, err := goquery.NewDocumentFromReader(res.Body)
+    if err != nil {
+        return nil, err
+    }
+
+    performerLink := doc.Find("div.Block3 a").FilterFunction(func(i int, s *goquery.Selection) bool {
+        href, _ := s.Attr("href")
+        if href == "/html/j_links/Jenna_Leigh_c/" || href == "/html/a_links/Alexa_Grace_c/" {
+            return false
+        }
+        if strings.ToLower(s.Text()) == strings.ToLower(performerName) {
+            return true
+        }
+        alias := s.ParentsFiltered(".babeNameBlock").Find(".babeAlias").First()
+        if strings.Contains(strings.ToLower(alias.Text()), strings.ToLower(performerName)) {
+            return true
+        }
+        return false
+    })
+
+    href, _ := performerLink.Attr("href")
+    href = strings.TrimSuffix(href, "/")
+    regex := regexp.MustCompile(`.+_links\/(.+)`)
+    matches := regex.FindStringSubmatch(href)
+    if len(matches) < 2 {
+        return nil, fmt.Errorf("No matches found in %s", href)
+    }
+
+    href = strings.Replace(href, matches[1], "bio_"+matches[1]+".php", -1)
+    href = "https://www.freeones.com" + href
+
+    return getPerformerBio(c, href)
+}
+
 func getIndexes(doc *goquery.Document) map[string]int {
     var indexes = make(map[string]int)
     doc.Find(".paramname").Each(func(i int, s *goquery.Selection) {
@@ -236,7 +285,7 @@ func paramValue(params *goquery.Selection, paramIndex int) string {
         return content
     }
     node = node.NextSibling
-    if (node == nil) {
+    if node == nil {
         return ""
     }
     return trim(node.FirstChild.Data)
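This file only contributes the Freeones side of the change; the generic layer that owns `scraperConfig` and decides which hook to call is elsewhere in the package and not part of this diff. Purely as an illustration of how such a dispatcher could combine the `URLs` list with the per-operation hooks (the function below and its name are invented, not the project's actual code):

// Illustrative only; assumes the scraperConfig sketch above and "strings" imported.
func scrapePerformerFromURL(scrapers []scraperConfig, href string) (*models.ScrapedPerformer, error) {
    for _, s := range scrapers {
        if s.scrapePerformerURLFunc == nil {
            continue // this scraper does not support URL-based performer scraping
        }
        for _, u := range s.URLs {
            if strings.Contains(href, u) {
                return s.scrapePerformerURLFunc(s, href)
            }
        }
    }
    return nil, nil // no scraper claimed the URL
}

With the Freeones config registered, a call along the lines of `scrapePerformerFromURL([]scraperConfig{GetFreeonesScraper()}, href)` would end up in the `GetPerformerURL` added above.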