diff --git a/pkg/scraper/config.go b/pkg/scraper/config.go
index 4e1a84557..07e916d16 100644
--- a/pkg/scraper/config.go
+++ b/pkg/scraper/config.go
@@ -1,6 +1,7 @@
 package scraper
 
 import (
+	"io"
 	"os"
 	"path/filepath"
 	"strings"
@@ -157,17 +158,27 @@ type scraperConfig struct {
 	XPathScrapers xpathScrapers `yaml:"xPathScrapers"`
 }
 
-func loadScraperFromYAML(path string) (*scraperConfig, error) {
+func loadScraperFromYAML(id string, reader io.Reader) (*scraperConfig, error) {
 	ret := &scraperConfig{}
 
-	file, err := os.Open(path)
-	defer file.Close()
+	parser := yaml.NewDecoder(reader)
+	parser.SetStrict(true)
+	err := parser.Decode(&ret)
 	if err != nil {
 		return nil, err
 	}
-
-	parser := yaml.NewDecoder(file)
-	parser.SetStrict(true)
-	err = parser.Decode(&ret)
+
+	ret.ID = id
+
+	// set the scraper interface
+	ret.initialiseConfigs()
+
+	return ret, nil
+}
+
+func loadScraperFromYAMLFile(path string) (*scraperConfig, error) {
+	file, err := os.Open(path)
+	defer file.Close()
 	if err != nil {
 		return nil, err
 	}
@@ -175,12 +186,8 @@ func loadScraperFromYAML(path string) (*scraperConfig, error) {
 	// set id to the filename
 	id := filepath.Base(path)
 	id = id[:strings.LastIndex(id, ".")]
-	ret.ID = id
 
-	// set the scraper interface
-	ret.initialiseConfigs()
-
-	return ret, nil
+	return loadScraperFromYAML(id, file)
 }
 
 func (c *scraperConfig) initialiseConfigs() {
diff --git a/pkg/scraper/freeones.go b/pkg/scraper/freeones.go
index 8345e8113..1afd39226 100644
--- a/pkg/scraper/freeones.go
+++ b/pkg/scraper/freeones.go
@@ -1,318 +1,102 @@
 package scraper
 
 import (
-	"fmt"
-	"net/http"
-	"net/url"
-	"regexp"
 	"strings"
-	"time"
 
-	"github.com/PuerkitoBio/goquery"
-	"github.com/stashapp/stash/pkg/models"
+	"github.com/stashapp/stash/pkg/logger"
 )
 
-const freeonesTimeout = 45 * time.Second
-
 const freeonesScraperID = "builtin_freeones"
-const freeonesName = "Freeones"
 
-var freeonesURLs = []string{
-	"freeones.com",
-}
+// 537: stolen from: https://github.com/stashapp/CommunityScrapers/blob/master/scrapers/NewFreeones.yml
+const freeonesScraperConfig = `
+name: Freeones
+performerByName:
+  action: scrapeXPath
+  queryURL: https://www.freeones.xxx/babes?q={}&v=teasers&s=relevance&l=96&m%5BcanPreviewFeatures%5D=0
+  scraper: performerSearch
+performerByURL:
+  - action: scrapeXPath
+    url:
+      - https://www.freeones.xxx
+    scraper: performerScraper
+
+xPathScrapers:
+  performerSearch:
+    performer:
+      Name: //div[@id="search-result"]//a[@class=""]//div//p/text()
+      URL:
+        selector: //div[@id="search-result"]//a[@class=""]/@href
+        # URL is a partial url, add the first part
+        replace:
+          - regex: ^
+            with: https://www.freeones.xxx
+          - regex: $
+            with: /profile
+
+  performerScraper:
+    performer:
+      Name: //h1
+      URL:
+        selector: //a[span[text()="Profile"]]/@href
+        # URL is a partial url, add the first part
+        replace:
+          - regex: ^
+            with: https://www.freeones.xxx
+      Twitter: //div[p[text()='Follow On']]//div//a[@class='d-flex align-items-center justify-content-center mr-2 social-icons color-twitter']/@href
+      Instagram: //div[p[text()='Follow On']]//div//a[@class='d-flex align-items-center justify-content-center mr-2 social-icons color-telegram']/@href
+      # need to add support for concatenating two elements or something
+      Birthdate:
+        selector: //div[p[text()='Personal Information']]//div//p[1]//a
+        replace:
+          - regex: Born On
+            with:
+          - regex: ","
+            with:
+        # reference date is: 2006/01/02
+        parseDate: January 2 2006
+      Ethnicity:
+        selector: //div[p[text()='Ethnicity']]//div//p[@class='mb-0 text-center']
+        replace:
+          - regex: Asian
+            with: "asian"
+          - regex: Caucasian
+            with: "white"
+          - regex: Black
+            with: "black"
+          - regex: Latin
+            with: "hispanic"
+      Country: //div[p[text()='Personal Information']]//div//p[3]//a[last()]
+      EyeColor: //div[p[text()='Eye Color']]//div//p//a//span
+      Height:
+        selector: //div[p[text()='Height']]//div//p//a//span
+        replace:
+          - regex: \D+[\s\S]+
+            with: ""
+      Measurements: //div[p[text()='Measurements']]//div[@class='p-3']//p
+      FakeTits: //div[p[text()='Fake Boobs']]//div[@class='p-3']//p
+      # nbsp; screws up the parsing, so use contains instead
+      CareerLength:
+        selector: //div[p[text()='career']]//div//div[@class='timeline-horizontal mb-3']//div//p[@class='m-0']
+        concat: "-"
+        replace:
+          - regex: -\w+-\w+-\w+-\w+-\w+$
+            with: ""
+      Aliases: //div[p[text()='Aliases']]//div//p[@class='mb-0 text-center']
+      Tattoos: //div[p[text()='Tattoos']]//div//p[@class='mb-0 text-center']
+      Piercings: //div[p[text()='Piercings']]//div//p[@class='mb-0 text-center']
+      Image:
+        selector: //div[@class='profile-image-large']//a/img/@src
+        # URL is a partial url, add the first part
+`
 
 func GetFreeonesScraper() scraperConfig {
-	return scraperConfig{
-		ID:   freeonesScraperID,
-		Name: "Freeones",
-		PerformerByName: &performerByNameConfig{
-			performScrape: GetPerformerNames,
-		},
-		PerformerByFragment: &performerByFragmentConfig{
-			performScrape: GetPerformer,
-		},
-		PerformerByURL: []*scrapePerformerByURLConfig{
-			&scrapePerformerByURLConfig{
-				scrapeByURLConfig: scrapeByURLConfig{
-					URL: freeonesURLs,
-				},
-				performScrape: GetPerformerURL,
-			},
-		},
-	}
-}
+	yml := freeonesScraperConfig
 
-func GetPerformerNames(c scraperTypeConfig, q string) ([]*models.ScrapedPerformer, error) {
-	// Request the HTML page.
-	queryURL := "https://www.freeones.com/suggestions.php?q=" + url.PathEscape(q) + "&t=1"
-	client := http.Client{
-		Timeout: freeonesTimeout,
-	}
-	res, err := client.Get(queryURL)
+	scraper, err := loadScraperFromYAML(freeonesScraperID, strings.NewReader(yml))
 	if err != nil {
-		return nil, err
-	}
-	defer res.Body.Close()
-	if res.StatusCode != 200 {
-		return nil, fmt.Errorf("status code error: %d %s", res.StatusCode, res.Status)
+		logger.Fatalf("Error loading builtin freeones scraper: %s", err.Error())
 	}
 
-	// Load the HTML document
-	doc, err := goquery.NewDocumentFromReader(res.Body)
-	if err != nil {
-		return nil, err
-	}
-
-	// Find the performers
-	var performers []*models.ScrapedPerformer
-	doc.Find(".suggestion").Each(func(i int, s *goquery.Selection) {
-		name := strings.Trim(s.Text(), " ")
-		p := models.ScrapedPerformer{
-			Name: &name,
-		}
-		performers = append(performers, &p)
-	})
-
-	return performers, nil
-}
-
-func GetPerformerURL(c scraperTypeConfig, href string) (*models.ScrapedPerformer, error) {
-	// if we're already in the bio page, just scrape it
-	reg := regexp.MustCompile(`\/bio_.*\.php$`)
-	if reg.MatchString(href) {
-		return getPerformerBio(c, href)
-	}
-
-	// otherwise try to get the bio page from the url
-	profileRE := regexp.MustCompile(`_links\/(.*?)\/$`)
-	if profileRE.MatchString(href) {
-		href = profileRE.ReplaceAllString(href, "_links/bio_$1.php")
-		return getPerformerBio(c, href)
-	}
-
-	return nil, fmt.Errorf("Bio page not found in %s", href)
-}
-
-func getPerformerBio(c scraperTypeConfig, href string) (*models.ScrapedPerformer, error) {
-	client := http.Client{
-		Timeout: freeonesTimeout,
-	}
-
-	bioRes, err := client.Get(href)
-	if err != nil {
-		return nil, err
-	}
-	defer bioRes.Body.Close()
-	if bioRes.StatusCode != 200 {
-		return nil, fmt.Errorf("status code error: %d %s", bioRes.StatusCode, bioRes.Status)
-	}
-
-	// Load the HTML document
-	bioDoc, err := goquery.NewDocumentFromReader(bioRes.Body)
-	if err != nil {
-		return nil, err
-	}
-
-	params := bioDoc.Find(".paramvalue")
-	paramIndexes := getIndexes(bioDoc)
-
-	result := models.ScrapedPerformer{}
-
-	performerURL := bioRes.Request.URL.String()
-	result.URL = &performerURL
-
-	name := paramValue(params, paramIndexes["name"])
-	result.Name = &name
-
-	ethnicity := getEthnicity(paramValue(params, paramIndexes["ethnicity"]))
-	result.Ethnicity = &ethnicity
-
-	country := paramValue(params, paramIndexes["country"])
-	result.Country = &country
-
-	eyeColor := paramValue(params, paramIndexes["eye_color"])
-	result.EyeColor = &eyeColor
-
-	measurements := paramValue(params, paramIndexes["measurements"])
-	result.Measurements = &measurements
-
-	fakeTits := paramValue(params, paramIndexes["fake_tits"])
-	result.FakeTits = &fakeTits
-
-	careerLength := paramValue(params, paramIndexes["career_length"])
-	careerRegex := regexp.MustCompile(`\([\s\S]*`)
-	careerLength = careerRegex.ReplaceAllString(careerLength, "")
-	careerLength = trim(careerLength)
-	result.CareerLength = &careerLength
-
-	tattoos := paramValue(params, paramIndexes["tattoos"])
-	result.Tattoos = &tattoos
-
-	piercings := paramValue(params, paramIndexes["piercings"])
-	result.Piercings = &piercings
-
-	aliases := paramValue(params, paramIndexes["aliases"])
-	result.Aliases = &aliases
-
-	birthdate := paramValue(params, paramIndexes["birthdate"])
-	birthdateRegex := regexp.MustCompile(` \(\d* years old\)`)
-	birthdate = birthdateRegex.ReplaceAllString(birthdate, "")
-	birthdate = trim(birthdate)
-	if birthdate != "Unknown" && len(birthdate) > 0 {
-		t, _ := time.Parse("January _2, 2006", birthdate) // TODO
-		formattedBirthdate := t.Format("2006-01-02")
-		result.Birthdate = &formattedBirthdate
-	}
-
-	height := paramValue(params, paramIndexes["height"])
-	heightRegex := regexp.MustCompile(`heightcm = "(.*)"\;`)
-	heightMatches := heightRegex.FindStringSubmatch(height)
-	if len(heightMatches) > 1 {
-		result.Height = &heightMatches[1]
-	}
-
-	twitterElement := bioDoc.Find(".twitter a")
-	twitterHref, _ := twitterElement.Attr("href")
-	if twitterHref != "" {
-		twitterURL, _ := url.Parse(twitterHref)
-		twitterHandle := strings.Replace(twitterURL.Path, "/", "", -1)
-		result.Twitter = &twitterHandle
-	}
-
-	instaElement := bioDoc.Find(".instagram a")
-	instaHref, _ := instaElement.Attr("href")
-	if instaHref != "" {
-		instaURL, _ := url.Parse(instaHref)
-		instaHandle := strings.Replace(instaURL.Path, "/", "", -1)
-		result.Instagram = &instaHandle
-	}
-
-	return &result, nil
-}
-
-func GetPerformer(c scraperTypeConfig, scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
-	if scrapedPerformer.Name == nil {
-		return nil, nil
-	}
-
-	performerName := *scrapedPerformer.Name
-	queryURL := "https://www.freeones.com/search/?t=1&q=" + url.PathEscape(performerName) + "&view=thumbs"
-	res, err := http.Get(queryURL)
-	if err != nil {
-		return nil, err
-	}
-	defer res.Body.Close()
-	if res.StatusCode != 200 {
-		return nil, fmt.Errorf("status code error: %d %s", res.StatusCode, res.Status)
-	}
-
-	// Load the HTML document
-	doc, err := goquery.NewDocumentFromReader(res.Body)
-	if err != nil {
-		return nil, err
-	}
-
-	performerLink := doc.Find("div.Block3 a").FilterFunction(func(i int, s *goquery.Selection) bool {
-		href, _ := s.Attr("href")
-		if href == "/html/j_links/Jenna_Leigh_c/" || href == "/html/a_links/Alexa_Grace_c/" {
-			return false
-		}
-		if strings.ToLower(s.Text()) == strings.ToLower(performerName) {
-			return true
-		}
-		alias := s.ParentsFiltered(".babeNameBlock").Find(".babeAlias").First()
-		if strings.Contains(strings.ToLower(alias.Text()), strings.ToLower(performerName)) {
-			return true
-		}
-		return false
-	})
-
-	href, _ := performerLink.Attr("href")
-	href = strings.TrimSuffix(href, "/")
-	regex := regexp.MustCompile(`.+_links\/(.+)`)
-	matches := regex.FindStringSubmatch(href)
-	if len(matches) < 2 {
-		return nil, fmt.Errorf("No matches found in %s", href)
-	}
-
-	href = strings.Replace(href, matches[1], "bio_"+matches[1]+".php", -1)
-	href = "https://www.freeones.com" + href
-
-	return getPerformerBio(c, href)
-
-}
-
-func getIndexes(doc *goquery.Document) map[string]int {
-	var indexes = make(map[string]int)
-	doc.Find(".paramname").Each(func(i int, s *goquery.Selection) {
-		index := i + 1
-		paramName := trim(s.Text())
-		switch paramName {
-		case "Babe Name:":
-			indexes["name"] = index
-		case "Ethnicity:":
-			indexes["ethnicity"] = index
-		case "Country of Origin:":
-			indexes["country"] = index
-		case "Date of Birth:":
-			indexes["birthdate"] = index
-		case "Eye Color:":
-			indexes["eye_color"] = index
-		case "Height:":
-			indexes["height"] = index
-		case "Measurements:":
-			indexes["measurements"] = index
-		case "Fake boobs:":
-			indexes["fake_tits"] = index
-		case "Career Start And End":
-			indexes["career_length"] = index
-		case "Tattoos:":
-			indexes["tattoos"] = index
-		case "Piercings:":
-			indexes["piercings"] = index
-		case "Aliases:":
-			indexes["aliases"] = index
-		}
-	})
-	return indexes
-}
-
-func getEthnicity(ethnicity string) string {
-	switch ethnicity {
-	case "Caucasian":
-		return "white"
-	case "Black":
-		return "black"
-	case "Latin":
-		return "hispanic"
-	case "Asian":
-		return "asian"
-	default:
-		// #367 - unknown ethnicity shouldn't cause the entire operation to
-		// fail. Just return the original string instead
-		return ethnicity
-	}
-}
-
-func paramValue(params *goquery.Selection, paramIndex int) string {
-	i := paramIndex - 1
-	if paramIndex <= 0 {
-		return ""
-	}
-	node := params.Get(i).FirstChild
-	content := trim(node.Data)
-	if content != "" {
-		return content
-	}
-	node = node.NextSibling
-	if node == nil {
-		return ""
-	}
-	return trim(node.FirstChild.Data)
-}
-
-// https://stackoverflow.com/questions/20305966/why-does-strip-not-remove-the-leading-whitespace
-func trim(text string) string {
-	// return text.replace(/\A\p{Space}*|\p{Space}*\z/, "");
-	return strings.TrimSpace(text)
+	return *scraper
 }
diff --git a/pkg/scraper/scrapers.go b/pkg/scraper/scrapers.go
index 1eff1a5a8..14025c3f2 100644
--- a/pkg/scraper/scrapers.go
+++ b/pkg/scraper/scrapers.go
@@ -32,7 +32,7 @@ func loadScrapers() ([]scraperConfig, error) {
 	scrapers = append(scrapers, GetFreeonesScraper())
 
 	for _, file := range scraperFiles {
-		scraper, err := loadScraperFromYAML(file)
+		scraper, err := loadScraperFromYAMLFile(file)
 		if err != nil {
 			logger.Errorf("Error loading scraper %s: %s", file, err.Error())
 		} else {
@@ -190,7 +190,7 @@ func matchMovie(m *models.ScrapedSceneMovie) error {
 		return err
 	}
 
-	if len(movies) !=1 {
+	if len(movies) != 1 {
 		// ignore - cannot match
 		return nil
 	}
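
Reviewer note: below is a minimal sketch of how the new reader-based entry point could be exercised. The test file is hypothetical and not part of this diff; it assumes only identifiers that appear above (loadScraperFromYAML, scraperConfig's ID and Name fields, and the yaml "name" key used by the builtin config).

package scraper

import (
	"strings"
	"testing"
)

// Hypothetical test, not included in this change: decode a minimal
// config through the new io.Reader entry point and check that the ID
// is taken from the caller rather than derived from a file name.
func TestLoadScraperFromYAML(t *testing.T) {
	c, err := loadScraperFromYAML("example", strings.NewReader("name: Example\n"))
	if err != nil {
		t.Fatalf("unexpected error: %s", err)
	}
	if c.ID != "example" {
		t.Errorf("got ID %q, want %q", c.ID, "example")
	}
	if c.Name != "Example" {
		t.Errorf("got Name %q, want %q", c.Name, "Example")
	}

	// SetStrict(true) should surface unknown keys as errors rather
	// than silently dropping them.
	if _, err := loadScraperFromYAML("bad", strings.NewReader("nam: oops\n")); err == nil {
		t.Error("expected an error for an unknown top-level key")
	}
}

Separating parsing from file I/O is what lets the builtin Freeones scraper feed its embedded YAML through strings.NewReader and share the same strict decoding path as user-supplied scraper files.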