Xpath scraping from URL (#285)

* Add xpath performer and scene scraping

* Add studio scraping

* Refactor code

* Fix compile error

* Don't overwrite performer URL during a scrape
This commit is contained in:
WithoutPants
2020-01-05 03:39:33 +11:00
committed by Leopere
parent d35f3a9b10
commit 7fdaccf669
93 changed files with 174400 additions and 4 deletions

View File

@@ -19,24 +19,27 @@ type scraperAction string
const (
scraperActionScript scraperAction = "script"
scraperActionStash scraperAction = "stash"
scraperActionXPath scraperAction = "scrapeXPath"
)
var allScraperAction = []scraperAction{
scraperActionScript,
scraperActionStash,
scraperActionXPath,
}
func (e scraperAction) IsValid() bool {
switch e {
case scraperActionScript:
case scraperActionScript, scraperActionStash, scraperActionXPath:
return true
}
return false
}
type scraperTypeConfig struct {
Action scraperAction `yaml:"action"`
Script []string `yaml:"script,flow"`
Action scraperAction `yaml:"action"`
Script []string `yaml:"script,flow"`
Scraper string `yaml:"scraper"`
scraperConfig *scraperConfig
}
@@ -96,6 +99,8 @@ type scrapePerformerByURLConfig struct {
func (c *scrapePerformerByURLConfig) resolveFn() {
if c.Action == scraperActionScript {
c.performScrape = scrapePerformerURLScript
} else if c.Action == scraperActionXPath {
c.performScrape = scrapePerformerURLXpath
}
}
@@ -124,6 +129,8 @@ type scrapeSceneByURLConfig struct {
func (c *scrapeSceneByURLConfig) resolveFn() {
if c.Action == scraperActionScript {
c.performScrape = scrapeSceneURLScript
} else if c.Action == scraperActionXPath {
c.performScrape = scrapeSceneURLXPath
}
}
@@ -135,7 +142,9 @@ type scraperConfig struct {
PerformerByURL []*scrapePerformerByURLConfig `yaml:"performerByURL"`
SceneByFragment *sceneByFragmentConfig `yaml:"sceneByFragment"`
SceneByURL []*scrapeSceneByURLConfig `yaml:"sceneByURL"`
StashServer *stashServer `yaml:"stashServer"`
StashServer *stashServer `yaml:"stashServer"`
XPathScrapers xpathScrapers `yaml:"xPathScrapers"`
}
func loadScraperFromYAML(path string) (*scraperConfig, error) {

267
pkg/scraper/xpath.go Normal file
View File

@@ -0,0 +1,267 @@
package scraper
import (
"errors"
"reflect"
"regexp"
"strings"
"github.com/antchfx/htmlquery"
"golang.org/x/net/html"
"github.com/stashapp/stash/pkg/logger"
"github.com/stashapp/stash/pkg/models"
)
type commonXPathConfig map[string]string
func (c commonXPathConfig) applyCommon(src string) string {
ret := src
for commonKey, commonVal := range c {
if strings.Contains(ret, commonKey) {
ret = strings.Replace(ret, commonKey, commonVal, -1)
}
}
return ret
}
type xpathScraperConfig map[string]interface{}
func createXPathScraperConfig(src map[interface{}]interface{}) xpathScraperConfig {
ret := make(xpathScraperConfig)
if src != nil {
for k, v := range src {
keyStr, isStr := k.(string)
if isStr {
ret[keyStr] = v
}
}
}
return ret
}
func (s xpathScraperConfig) process(doc *html.Node, common commonXPathConfig) []xPathResult {
var ret []xPathResult
for k, v := range s {
asStr, isStr := v.(string)
if isStr {
// apply common
if common != nil {
asStr = common.applyCommon(asStr)
}
found := htmlquery.Find(doc, asStr)
if len(found) > 0 {
for i, elem := range found {
if i >= len(ret) {
ret = append(ret, make(xPathResult))
}
ret[i][k] = elem
}
}
}
// TODO - handle map type
}
return ret
}
type xpathScrapers map[string]*xpathScraper
type xpathScraper struct {
Common commonXPathConfig `yaml:"common"`
Scene xpathScraperConfig `yaml:"scene"`
Performer xpathScraperConfig `yaml:"performer"`
}
const (
XPathScraperConfigSceneTags = "Tags"
XPathScraperConfigScenePerformers = "Performers"
XPathScraperConfigSceneStudio = "Studio"
)
func (s xpathScraper) GetSceneSimple() xpathScraperConfig {
// exclude the complex sub-configs
ret := make(xpathScraperConfig)
mapped := s.Scene
if mapped != nil {
for k, v := range mapped {
if k != XPathScraperConfigSceneTags && k != XPathScraperConfigScenePerformers && k != XPathScraperConfigSceneStudio {
ret[k] = v
}
}
}
return ret
}
func (s xpathScraper) getSceneSubMap(key string) xpathScraperConfig {
var ret map[interface{}]interface{}
mapped := s.Scene
if mapped != nil {
v, ok := mapped[key]
if ok {
ret, _ = v.(map[interface{}]interface{})
}
}
if ret != nil {
return createXPathScraperConfig(ret)
}
return nil
}
func (s xpathScraper) GetScenePerformers() xpathScraperConfig {
return s.getSceneSubMap(XPathScraperConfigScenePerformers)
}
func (s xpathScraper) GetSceneTags() xpathScraperConfig {
return s.getSceneSubMap(XPathScraperConfigSceneTags)
}
func (s xpathScraper) GetSceneStudio() xpathScraperConfig {
return s.getSceneSubMap(XPathScraperConfigSceneStudio)
}
func (s xpathScraper) scrapePerformer(doc *html.Node) (*models.ScrapedPerformer, error) {
var ret models.ScrapedPerformer
performerMap := s.Performer
if performerMap == nil {
return nil, nil
}
results := performerMap.process(doc, s.Common)
if len(results) > 0 {
results[0].apply(&ret)
}
return &ret, nil
}
func (s xpathScraper) scrapeScene(doc *html.Node) (*models.ScrapedScene, error) {
var ret models.ScrapedScene
sceneMap := s.GetSceneSimple()
if sceneMap == nil {
return nil, nil
}
scenePerformersMap := s.GetScenePerformers()
sceneTagsMap := s.GetSceneTags()
sceneStudioMap := s.GetSceneStudio()
results := sceneMap.process(doc, s.Common)
if len(results) > 0 {
results[0].apply(&ret)
// now apply the performers and tags
if scenePerformersMap != nil {
performerResults := scenePerformersMap.process(doc, s.Common)
for _, p := range performerResults {
performer := &models.ScrapedScenePerformer{}
p.apply(performer)
ret.Performers = append(ret.Performers, performer)
}
}
if sceneTagsMap != nil {
tagResults := sceneTagsMap.process(doc, s.Common)
for _, p := range tagResults {
tag := &models.ScrapedSceneTag{}
p.apply(tag)
ret.Tags = append(ret.Tags, tag)
}
}
if sceneStudioMap != nil {
studioResults := sceneStudioMap.process(doc, s.Common)
if len(studioResults) > 0 {
studio := &models.ScrapedSceneStudio{}
studioResults[0].apply(studio)
ret.Studio = studio
}
}
}
return &ret, nil
}
type xPathResult map[string]*html.Node
func (r xPathResult) apply(dest interface{}) {
destVal := reflect.ValueOf(dest)
// dest should be a pointer
destVal = destVal.Elem()
for key, v := range r {
field := destVal.FieldByName(key)
if field.IsValid() {
value := htmlquery.InnerText(v)
value = strings.TrimSpace(value)
// remove multiple whitespace and end lines
re := regexp.MustCompile("\n")
value = re.ReplaceAllString(value, "")
re = regexp.MustCompile(" +")
value = re.ReplaceAllString(value, " ")
var reflectValue reflect.Value
if field.Kind() == reflect.Ptr {
reflectValue = reflect.ValueOf(&value)
} else {
reflectValue = reflect.ValueOf(value)
}
field.Set(reflectValue)
} else {
logger.Errorf("Field %s does not exist in %T", key, dest)
}
}
}
func scrapePerformerURLXpath(c scraperTypeConfig, url string) (*models.ScrapedPerformer, error) {
scraper := c.scraperConfig.XPathScrapers[c.Scraper]
if scraper == nil {
return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
}
doc, err := htmlquery.LoadURL(url)
if err != nil {
return nil, err
}
return scraper.scrapePerformer(doc)
}
func scrapeSceneURLXPath(c scraperTypeConfig, url string) (*models.ScrapedScene, error) {
scraper := c.scraperConfig.XPathScrapers[c.Scraper]
if scraper == nil {
return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
}
doc, err := htmlquery.LoadURL(url)
if err != nil {
return nil, err
}
return scraper.scrapeScene(doc)
}

732
pkg/scraper/xpath_test.go Normal file
View File

@@ -0,0 +1,732 @@
package scraper
import (
"strings"
"testing"
"github.com/antchfx/htmlquery"
"github.com/stashapp/stash/pkg/models"
"gopkg.in/yaml.v2"
)
// adapted from https://www.freeones.com/html/m_links/bio_Mia_Malkova.php
const htmlDoc1 = `
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">
<head>
<title>Freeones: Mia Malkova Biography</title>
</head>
<body data-babe="Mia Malkova">
<div class="ContentBlock Block1">
<div class="ContentBlockBody" style="padding: 0px;">
<table id="biographyTable" border="0" cellspacing="0" cellpadding="0" width="100%">
<tbody>
<tr>
<td class="paramname">
<div><b>Babe Name:</b></div>
</td>
<td class="paramvalue">
<a href="/html/m_links/Mia_Malkova/">Mia Malkova</a>&nbsp;
</td>
</tr>
<tr>
<td class="paramname">
<div><b>Profession:</b></div>
</td>
<td class="paramvalue">Porn Star
</td>
</tr>
<tr>
<td class="paramname">
<b>Ethnicity:</b>
</td>
<td class="paramvalue">
Caucasian&nbsp;
</td>
</tr>
<tr>
<td class="paramname">
<b>Country of Origin:</b>
</td>
<td class="paramvalue">
<span class="country-us">
United States
<span>
</span></span></td>
</tr>
<tr>
<td class="paramname">
<b>Date of Birth:</b>
</td>
<td class="paramvalue">
July 1, 1992 (27 years old)&nbsp;
</td>
</tr>
<tr>
<td class="paramname">
<b>Aliases:</b>
</td>
<td class="paramvalue">
Mia Bliss, Madison Clover, Madison Swan, Mia Mountain, Jessica&nbsp;
</td>
</tr>
<tr>
<td class="paramname">
<b>Eye Color:</b>
</td>
<td class="paramvalue">
Hazel&nbsp;
</td>
</tr>
<tr>
<td class="paramname">
<b>Hair Color:</b>
</td>
<td class="paramvalue">
Blonde&nbsp;
</td>
</tr>
<tr>
<td class="paramname">
<b>Height:</b>
</td>
<td class="paramvalue">
<script type="text/javascript">
<!--
heightcm = "171";
morethenone = 'inch';
feet = heightcm / 30.48;
inches = (feet - Math.floor(feet)) * 30.48 / 2.54;
feet = Math.floor(feet);
inches = inches.toFixed(0);
if (inches > 1) {
morethenone = 'inches';
}
if (heightcm == 0) {
message = 'Unknown';
} else {
message = '171 cm - ' + feet + ' feet and ' + inches + ' ' + morethenone;
}
document.write(message);
// -->
</script>&nbsp;
</td>
</tr>
<tr>
<td class="paramname">
<b>Measurements:</b>
</td>
<td class="paramvalue">
34C-26-36
</td>
</tr>
<tr>
<td class="paramname">
<b>Fake boobs:</b>
</td>
<td class="paramvalue">
No&nbsp;
</td>
</tr>
<tr>
<td class="paramname">
<b>Career Start And End</b>
</td>
<td class="paramvalue">
2012 - 2019
(7 Years In The Business)
</td>
</tr>
<tr>
<td class="paramname">
<b>Tattoos:</b>
</td>
<td class="paramvalue">
None&nbsp;
</td>
</tr>
<tr>
<td class="paramname">
<b>Piercings:</b>
</td>
<td class="paramvalue">
None&nbsp;
</td>
</tr>
<tr>
<td class="paramname">
<div><b>Social Network Links:</b></div>
</td>
<td class="paramvalue">
<ul id="socialmedia">
<li class="twitter"><a href="https://twitter.com/MiaMalkova" target="_blank" alt="Mia Malkova Twitter" title="Mia Malkova Twitter">Twitter</a></li>
<li class="facebook"><a href="https://www.facebook.com/MiaMalcove" target="_blank" alt="Mia Malkova Facebook" title="Mia Malkova Facebook">Facebook</a></li>
<li class="youtube"><a href="https://www.youtube.com/channel/UCEPR0sZKa_ScMoyhemfB7nA" target="_blank" alt="Mia Malkova YouTube" title="Mia Malkova YouTube">YouTube</a></li>
<li class="instagram"><a href="https://www.instagram.com/mia_malkova/" target="_blank" alt="Mia Malkova Instagram" title="Mia Malkova Instagram">Instagram</a></li>
</ul>
</td>
</tr>
</tbody>
</table>
</div>
</div>
</body>
</html>
`
func makeCommonXPath(attr string) string {
return `//table[@id="biographyTable"]//tr/td[@class="paramname"]//b[text() = '` + attr + `']/ancestor::tr/td[@class="paramvalue"]`
}
func makeXPathConfig() xpathScraperConfig {
config := make(xpathScraperConfig)
config["Name"] = makeCommonXPath("Babe Name:") + `/a`
config["Ethnicity"] = makeCommonXPath("Ethnicity:")
config["Country"] = makeCommonXPath("Country of Origin:")
config["Birthdate"] = makeCommonXPath("Date of Birth:")
config["Aliases"] = makeCommonXPath("Aliases:")
config["EyeColor"] = makeCommonXPath("Eye Color:")
config["Measurements"] = makeCommonXPath("Measurements:")
config["FakeTits"] = makeCommonXPath("Fake boobs:")
config["Height"] = makeCommonXPath("Height:")
// no colon in attribute header
config["CareerLength"] = makeCommonXPath("Career Start And End")
config["Tattoos"] = makeCommonXPath("Tattoos:")
config["Piercings"] = makeCommonXPath("Piercings:")
return config
}
func verifyField(t *testing.T, expected string, actual *string, field string) {
t.Helper()
if actual == nil || *actual != expected {
if actual == nil {
t.Errorf("Expected %s to be set to %s, instead got nil", field, expected)
} else {
t.Errorf("Expected %s to be set to %s, instead got %s", field, expected, *actual)
}
}
}
func TestScrapePerformerXPath(t *testing.T) {
reader := strings.NewReader(htmlDoc1)
doc, err := htmlquery.Parse(reader)
if err != nil {
t.Errorf("Error loading document: %s", err.Error())
return
}
xpathConfig := makeXPathConfig()
scraper := xpathScraper{
Performer: xpathConfig,
}
performer, err := scraper.scrapePerformer(doc)
if err != nil {
t.Errorf("Error scraping performer: %s", err.Error())
return
}
const performerName = "Mia Malkova"
const ethnicity = "Caucasian"
const country = "United States"
const birthdate = "July 1, 1992 (27 years old)"
const aliases = "Mia Bliss, Madison Clover, Madison Swan, Mia Mountain, Jessica"
const eyeColor = "Hazel"
const measurements = "34C-26-36"
const fakeTits = "No"
const careerLength = "2012 - 2019"
const tattoosPiercings = "None"
verifyField(t, performerName, performer.Name, "Name")
verifyField(t, ethnicity, performer.Ethnicity, "Ethnicity")
verifyField(t, country, performer.Country, "Country")
verifyField(t, birthdate, performer.Birthdate, "Birthdate")
verifyField(t, aliases, performer.Aliases, "Aliases")
verifyField(t, eyeColor, performer.EyeColor, "EyeColor")
verifyField(t, measurements, performer.Measurements, "Measurements")
verifyField(t, fakeTits, performer.FakeTits, "FakeTits")
// TODO - this needs post-processing
//verifyField(t, careerLength, performer.CareerLength, "CareerLength")
verifyField(t, tattoosPiercings, performer.Tattoos, "Tattoos")
verifyField(t, tattoosPiercings, performer.Piercings, "Piercings")
}
const sceneHTML = `
<!DOCTYPE html>
<head>
<title>Test Video - Pornhub.com</title>
<meta property="og:title" content="Test Video" />
<meta property="og:description"
content="Watch Test Video on Pornhub.com, the best hardcore porn site. Pornhub is home to the widest selection of free Babe sex videos full of the hottest pornstars. If you&#039;re craving 3some XXX movies you&#039;ll find them here." />
<meta property="og:image"
content="https://di.phncdn.com/videos/201910/13/254476211/thumbs_80/(m=eaAaGwObaaaa)(mh=_V1YEGdMFS1rEYoW)9.jpg" />
<script type="application/ld+json">
{
"@context": "http://schema.org/",
"@type": "VideoObject",
"name": "Test Video",
"embedUrl": "https://www.pornhub.com/embed/ph5da270596459c",
"duration": "PT00H33M27S",
"thumbnailUrl": "https://di.phncdn.com/videos/201910/13/254476211/thumbs_80/(m=eaAaGwObaaaa)(mh=_V1YEGdMFS1rEYoW)9.jpg",
"uploadDate": "2019-10-13T00:33:51+00:00",
"description": "Watch Test Video on Pornhub&period;com&comma; the best hardcore porn site&period; Pornhub is home to the widest selection of free Babe sex videos full of the hottest pornstars&period; If you&apos;re craving 3some XXX movies you&apos;ll find them here&period;",
"author" : "Mia Malkova", "interactionStatistic": [
{
"@type": "InteractionCounter",
"interactionType": "http://schema.org/WatchAction",
"userInteractionCount": "5,908,861"
},
{
"@type": "InteractionCounter",
"interactionType": "http://schema.org/LikeAction",
"userInteractionCount": "22,090"
}
]
}
</script>
</head>
<body class="logged-out">
<div class="container ">
<div id="main-container" class="clearfix" data-delete-check="1" data-is-private="1" data-is-premium=""
data-liu="0" data-next-shuffle="ph5da270596459c" data-pkey="" data-platform-pc="1" data-playlist-check="0"
data-playlist-id-check="0" data-playlist-geo-check="0" data-friend="0" data-playlist-user-check="0"
data-playlist-video-check="0" data-playlist-shuffle="0" data-shuffle-forward="ph5da270596459c"
data-shuffle-back="ph5da270596459c" data-min-large="1350"
data-video-title="Test Video">
<div id="vpContentContainer">
<div id="hd-leftColVideoPage">
<div class="video-wrapper">
<div class="title-container">
<i class="isMe tooltipTrig" data-title="Video of verified member"></i>
<h1 class="title">
<span class="inlineFree">Test Video</span>
</h1>
</div>
<div class="video-actions-container">
<div class="video-actions-tabs">
<div class="video-action-tab about-tab active">
<div class="video-detailed-info">
<div class="video-info-row">
From:&nbsp;
<div class="usernameWrap clearfix" data-type="channel" data-userid="492538092"
data-liu-user="0"
data-json-url="/user/box?id=492538092&amp;token=MTU3NzA1NTkzNIqATol8v_WrhmNTXkeflvG09C2U7UUT_NyoZUFa7iKq0mlzBkmdgAH1aNHZkJmIOHbbwmho1BehHDoA63K5Wn4."
data-disable-popover="0">
<a rel="" href="/channels/sis-loves-me" class="bolded">Sis Loves Me</a>
<div class="avatarPosition"></div>
</div>
<span class="verified-icon flag tooltipTrig"
data-title="Verified member"></span>
- 87 videos
<span class="subscribers-count">&nbsp;459466</span>
</div>
<div class="video-info-row">
<div class="pornstarsWrapper">
Pornstars:&nbsp;
<a class="pstar-list-btn js-mxp" data-mxptype="Pornstar"
data-mxptext="Alex D" data-id="251341" data-login="1"
href="/pornstar/alex-d">Alex D <span
class="psbox-link-container display-none"></span>
</a>
, <a class="pstar-list-btn js-mxp" data-mxptype="Pornstar"
data-mxptext="Mia Malkova" data-id="10641" data-login="1"
href="/pornstar/mia-malkova">Mia Malkova <span
class="psbox-link-container display-none"></span>
</a>
, <a class="pstar-list-btn js-mxp" data-mxptype="Pornstar"
data-mxptext="Riley Reid" data-id="5343" data-login="1"
href="/pornstar/riley-reid">Riley Reid <span
class="psbox-link-container display-none"></span>
</a>
<div class="tooltipTrig suggestBtn" data-title="Add a pornstar">
<a class="add-btn-small add-pornstar-btn-2">+
<span>Suggest</span></a>
</div>
<div id="deletePornstarResult" class="suggest-result"></div>
</div>
</div>
<div class="video-info-row showLess">
<div class="categoriesWrapper">
Categories:&nbsp;
<a href="/video?c=3"
onclick="ga('send', 'event', 'Watch Page', 'click', 'Category');">Amateur</a>,
<a href="/categories/babe"
onclick="ga('send', 'event', 'Watch Page', 'click', 'Category');">Babe</a>,
<a href="/video?c=13"
onclick="ga('send', 'event', 'Watch Page', 'click', 'Category');">Blowjob</a>,
<a href="/video?c=115"
onclick="ga('send', 'event', 'Watch Page', 'click', 'Category');">Exclusive</a>,
<a href="/hd"
onclick="ga('send', 'event', 'Watch Page', 'click', 'Category');">HD
Porn</a>, <a href="/categories/pornstar"
onclick="ga('send', 'event', 'Watch Page', 'click', 'Category');">Pornstar</a>,
<a href="/video?c=24"
onclick="ga('send', 'event', 'Watch Page', 'click', 'Category');">Public</a>,
<a href="/video?c=131"
onclick="ga('send', 'event', 'Watch Page', 'click', 'Category');">Pussy
Licking</a>, <a href="/video?c=65"
onclick="ga('send', 'event', 'Watch Page', 'click', 'Category');">Threesome</a>,
<a href="/video?c=139"
onclick="ga('send', 'event', 'Watch Page', 'click', 'Category');">Verified
Models</a>
<div class="tooltipTrig suggestBtn" data-title="Suggest Categories">
<a id="categoryLink" class="add-btn-small ">+
<span>Suggest</span></a>
</div>
</div>
</div>
<div class="video-info-row showLess">
<div class="productionWrapper">
Production:&nbsp;
<a href="/video?p=professional" rel="nofollow"
class="production">professional</a>
</div>
</div>
<div class="video-info-row showLess">
<div class="tagsWrapper">
Tags:&nbsp;
<a href="/video/search?search=3some">3some</a>, <a
href="/video?c=9">blonde</a>, <a href="/video?c=59">small tits</a>,
<a href="/video/search?search=butt">butt</a>, <a
href="/video/search?search=natural+tits">natural tits</a>, <a
href="/video/search?search=petite">petite</a>, <a
href="/video?c=24">public</a>, <a
href="/video/search?search=outside">outside</a>, <a
href="/video/search?search=car">car</a>, <a
href="/video/search?search=garage">garage</a>, <a
href="/video?c=65">threesome</a>, <a
href="/video/search?search=bgg">bgg</a>, <a
href="/video/search?search=girlfrien+d">girlfrien d</a>, <a
href="/video/search?search=parking">parking</a>, <a
href="/video/search?search=sex">sex</a>, <a
href="/video/search?search=gagging">gagging</a>, <a
href="/video?c=13">blowjob</a>, <a
href="/video/search?search=bj">bj</a>, <a
href="/video/search?search=double">double</a>, <a
href="/video/search?search=ass">ass</a>
<div class="tooltipTrig suggestBtn" data-title="Suggest Tags">
<a id="tagLink" class="add-btn-small">+ <span>Suggest</span></a>
</div>
</div>
</div>
<div class="video-info-row showLess">
Added on: <span class="white">2 months ago</span>
</div>
<div class="video-info-row showLess">
Featured on: <span class="white">1 month ago</span>
</div>
</div>
</div>
<div class="video-action-tab jump-to-tab">
<div class="title">Jump to your favorite action</div>
<div class="filters mainFilter float-right">
<div class="dropdownTrigger">
<div>
<span class="textFilter" id="tagSort">Sequence</span>
<span class="arrowFilters"></span>
</div>
<ul class="filterListItem dropdownWrapper">
<li class="active"><a class="actionTagSort"
data-sort="seconds">Sequence</a></li>
<li><a class="actionTagSort" data-sort="tag">Alphabetical</a></li>
</ul>
</div>
</div>
<div class="reset"></div>
<div class="display-grid col-4 gap-row-none sortBy seconds">
<ul class="actionTagList full-width margin-none">
<li>
<a class="js-triggerJumpCat"
onclick="jumpToAction(862), ga('send', 'event', 'Video Page', 'click', 'Jump to Blowjob');">
Blowjob </a>
&nbsp;
<var>14:22</var>
</li>
<li>
<a class="js-triggerJumpCat"
onclick="jumpToAction(1117), ga('send', 'event', 'Video Page', 'click', 'Jump to Reverse Cowgirl');">
Reverse Cowgirl </a>
&nbsp;
<var>18:37</var>
</li>
</ul>
<ul class="actionTagList full-width margin-none">
<li>
<a class="js-triggerJumpCat"
onclick="jumpToAction(1182), ga('send', 'event', 'Video Page', 'click', 'Jump to Cowgirl');">
Cowgirl </a>
&nbsp;
<var>19:42</var>
</li>
<li>
<a class="js-triggerJumpCat"
onclick="jumpToAction(1625), ga('send', 'event', 'Video Page', 'click', 'Jump to Cowgirl');">
Cowgirl </a>
&nbsp;
<var>27:05</var>
</li>
</ul>
<ul class="actionTagList full-width margin-none">
<li>
<a class="js-triggerJumpCat"
onclick="jumpToAction(1822), ga('send', 'event', 'Video Page', 'click', 'Jump to Doggystyle');">
Doggystyle </a>
&nbsp;
<var>30:22</var>
</li>
</ul>
</div>
<div class="display-grid col-4 gap-row-none sortBy tag">
<ul class="actionTagList full-width margin-none">
<li>
<a class="js-triggerJumpCat"
onclick="jumpToAction(862), ga('send', 'event', 'Video Page', 'click', 'Jump to Blowjob');">
Blowjob </a>
&nbsp;
<var>14:22</var>
</li>
<li>
<a class="js-triggerJumpCat"
onclick="jumpToAction(1117), ga('send', 'event', 'Video Page', 'click', 'Jump to Reverse Cowgirl');">
Reverse Cowgirl </a>
&nbsp;
<var>18:37</var>
</li>
</ul>
<ul class="actionTagList full-width margin-none">
<li>
<a class="js-triggerJumpCat"
onclick="jumpToAction(1182), ga('send', 'event', 'Video Page', 'click', 'Jump to Cowgirl');">
Cowgirl </a>
&nbsp;
<var>19:42</var>
</li>
<li>
<a class="js-triggerJumpCat"
onclick="jumpToAction(1625), ga('send', 'event', 'Video Page', 'click', 'Jump to Cowgirl');">
Cowgirl </a>
&nbsp;
<var>27:05</var>
</li>
</ul>
<ul class="actionTagList full-width margin-none">
<li>
<a class="js-triggerJumpCat"
onclick="jumpToAction(1822), ga('send', 'event', 'Video Page', 'click', 'Jump to Doggystyle');">
Doggystyle </a>
&nbsp;
<var>30:22</var>
</li>
</ul>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</body>
</html>`
func makeSceneXPathConfig() xpathScraper {
common := make(commonXPathConfig)
common["$performerElem"] = `//div[@class="pornstarsWrapper"]/a[@data-mxptype="Pornstar"]`
common["$studioElem"] = `//div[@data-type="channel"]/a`
config := make(xpathScraperConfig)
config["Title"] = `//meta[@property="og:title"]/@content`
// this needs post-processing
config["Date"] = `//script[@type="application/ld+json"]`
tagConfig := make(map[interface{}]interface{})
tagConfig["Name"] = `//div[@class="categoriesWrapper"]//a[not(@class="add-btn-small ")]`
config["Tags"] = tagConfig
performerConfig := make(map[interface{}]interface{})
performerConfig["Name"] = `$performerElem/@data-mxptext`
performerConfig["URL"] = `$performerElem/@href`
config["Performers"] = performerConfig
studioConfig := make(map[interface{}]interface{})
studioConfig["Name"] = `$studioElem`
studioConfig["URL"] = `$studioElem/@href`
config["Studio"] = studioConfig
scraper := xpathScraper{
Scene: config,
Common: common,
}
return scraper
}
func verifyTags(t *testing.T, expectedTagNames []string, actualTags []*models.ScrapedSceneTag) {
t.Helper()
i := 0
for i < len(expectedTagNames) || i < len(actualTags) {
expectedTag := ""
actualTag := ""
if i < len(expectedTagNames) {
expectedTag = expectedTagNames[i]
}
if i < len(actualTags) {
actualTag = actualTags[i].Name
}
if expectedTag != actualTag {
t.Errorf("Expected tag %s, got %s", expectedTag, actualTag)
}
i++
}
}
func verifyPerformers(t *testing.T, expectedNames []string, expectedURLs []string, actualPerformers []*models.ScrapedScenePerformer) {
t.Helper()
i := 0
for i < len(expectedNames) || i < len(actualPerformers) {
expectedName := ""
actualName := ""
expectedURL := ""
actualURL := ""
if i < len(expectedNames) {
expectedName = expectedNames[i]
}
if i < len(expectedURLs) {
expectedURL = expectedURLs[i]
}
if i < len(actualPerformers) {
actualName = actualPerformers[i].Name
if actualPerformers[i].URL != nil {
actualURL = *actualPerformers[i].URL
}
}
if expectedName != actualName {
t.Errorf("Expected performer name %s, got %s", expectedName, actualName)
}
if expectedURL != actualURL {
t.Errorf("Expected perfromer URL %s, got %s", expectedName, actualName)
}
i++
}
}
func TestApplySceneXPathConfig(t *testing.T) {
reader := strings.NewReader(sceneHTML)
doc, err := htmlquery.Parse(reader)
if err != nil {
t.Errorf("Error loading document: %s", err.Error())
return
}
scraper := makeSceneXPathConfig()
scene, err := scraper.scrapeScene(doc)
if err != nil {
t.Errorf("Error scraping scene: %s", err.Error())
return
}
const title = "Test Video"
verifyField(t, title, scene.Title, "Title")
// verify tags
expectedTags := []string{
"Amateur",
"Babe",
"Blowjob",
"Exclusive",
"HD Porn",
"Pornstar",
"Public",
"Pussy Licking",
"Threesome",
"Verified Models",
}
verifyTags(t, expectedTags, scene.Tags)
expectedPerformerNames := []string{
"Alex D",
"Mia Malkova",
"Riley Reid",
}
expectedPerformerURLs := []string{
"/pornstar/alex-d",
"/pornstar/mia-malkova",
"/pornstar/riley-reid",
}
verifyPerformers(t, expectedPerformerNames, expectedPerformerURLs, scene.Performers)
const expectedStudioName = "Sis Loves Me"
const expectedStudioURL = "/channels/sis-loves-me"
verifyField(t, expectedStudioName, &scene.Studio.Name, "Studio.Name")
verifyField(t, expectedStudioURL, scene.Studio.URL, "Studio.URL")
}
func TestLoadXPathScraperFromYAML(t *testing.T) {
const yamlStr = `name: Test
performerByURL:
- action: scrapeXPath
url:
- test.com
scraper: performerScraper
xPathScrapers:
performerScraper:
performer:
name: //h1[@itemprop="name"]
`
config := &scraperConfig{}
err := yaml.Unmarshal([]byte(yamlStr), &config)
if err != nil {
t.Errorf("Error loading yaml: %s", err.Error())
return
}
}