mirror of
https://github.com/stashapp/stash.git
synced 2025-12-18 04:44:37 +03:00
Xpath scraping from URL (#285)
* Add xpath performer and scene scraping
* Add studio scraping
* Refactor code
* Fix compile error
* Don't overwrite performer URL during a scrape
This commit is contained in:
@@ -19,24 +19,27 @@ type scraperAction string
|
||||
const (
|
||||
scraperActionScript scraperAction = "script"
|
||||
scraperActionStash scraperAction = "stash"
|
||||
scraperActionXPath scraperAction = "scrapeXPath"
|
||||
)
|
||||
|
||||
var allScraperAction = []scraperAction{
|
||||
scraperActionScript,
|
||||
scraperActionStash,
|
||||
scraperActionXPath,
|
||||
}
|
||||
|
||||
func (e scraperAction) IsValid() bool {
|
||||
switch e {
|
||||
case scraperActionScript:
|
||||
case scraperActionScript, scraperActionStash, scraperActionXPath:
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
type scraperTypeConfig struct {
|
||||
Action scraperAction `yaml:"action"`
|
||||
Script []string `yaml:"script,flow"`
|
||||
Action scraperAction `yaml:"action"`
|
||||
Script []string `yaml:"script,flow"`
|
||||
Scraper string `yaml:"scraper"`
|
||||
|
||||
scraperConfig *scraperConfig
|
||||
}
|
||||
@@ -96,6 +99,8 @@ type scrapePerformerByURLConfig struct {
|
||||
func (c *scrapePerformerByURLConfig) resolveFn() {
|
||||
if c.Action == scraperActionScript {
|
||||
c.performScrape = scrapePerformerURLScript
|
||||
} else if c.Action == scraperActionXPath {
|
||||
c.performScrape = scrapePerformerURLXpath
|
||||
}
|
||||
}
|
||||
|
||||
@@ -124,6 +129,8 @@ type scrapeSceneByURLConfig struct {
|
||||
func (c *scrapeSceneByURLConfig) resolveFn() {
|
||||
if c.Action == scraperActionScript {
|
||||
c.performScrape = scrapeSceneURLScript
|
||||
} else if c.Action == scraperActionXPath {
|
||||
c.performScrape = scrapeSceneURLXPath
|
||||
}
|
||||
}
|
||||
|
||||
@@ -135,7 +142,9 @@ type scraperConfig struct {
|
||||
PerformerByURL []*scrapePerformerByURLConfig `yaml:"performerByURL"`
|
||||
SceneByFragment *sceneByFragmentConfig `yaml:"sceneByFragment"`
|
||||
SceneByURL []*scrapeSceneByURLConfig `yaml:"sceneByURL"`
|
||||
StashServer *stashServer `yaml:"stashServer"`
|
||||
|
||||
StashServer *stashServer `yaml:"stashServer"`
|
||||
XPathScrapers xpathScrapers `yaml:"xPathScrapers"`
|
||||
}
|
||||
|
||||
func loadScraperFromYAML(path string) (*scraperConfig, error) {
|
||||
|
||||
267
pkg/scraper/xpath.go
Normal file
267
pkg/scraper/xpath.go
Normal file
@@ -0,0 +1,267 @@
|
||||
package scraper
|
||||
|
||||
import (
	"errors"
	"reflect"
	"regexp"
	"sort"
	"strings"

	"github.com/antchfx/htmlquery"
	"golang.org/x/net/html"

	"github.com/stashapp/stash/pkg/logger"
	"github.com/stashapp/stash/pkg/models"
)
|
||||
|
||||
// commonXPathConfig maps placeholder strings to the text that replaces them
// inside XPath expressions.
type commonXPathConfig map[string]string

// applyCommon returns src with every occurrence of each placeholder key in c
// replaced by that key's value.
//
// Placeholders are substituted longest first, with ties broken in lexical
// order. The result is therefore deterministic even when one placeholder is a
// prefix of another (for example "$performer" and "$performerElem"); ranging
// over the map directly would pick an arbitrary order. Empty keys are
// skipped, because replacing "" would insert the value between every
// character of src.
func (c commonXPathConfig) applyCommon(src string) string {
	keys := make([]string, 0, len(c))
	for k := range c {
		if k != "" {
			keys = append(keys, k)
		}
	}

	sort.Slice(keys, func(i, j int) bool {
		if len(keys[i]) != len(keys[j]) {
			return len(keys[i]) > len(keys[j])
		}
		return keys[i] < keys[j]
	})

	ret := src
	for _, k := range keys {
		ret = strings.Replace(ret, k, c[k], -1)
	}

	return ret
}
|
||||
|
||||
// xpathScraperConfig maps a configuration key to its value. Only string
// values (XPath expressions) are evaluated by process; nested maps hold
// sub-configurations.
type xpathScraperConfig map[string]interface{}

// createXPathScraperConfig converts a generically-keyed map into an
// xpathScraperConfig, keeping only the entries whose key is a string.
// The result is never nil, even for a nil src.
func createXPathScraperConfig(src map[interface{}]interface{}) xpathScraperConfig {
	converted := xpathScraperConfig{}

	// Ranging over a nil map performs no iterations.
	for rawKey, value := range src {
		name, ok := rawKey.(string)
		if !ok {
			continue
		}
		converted[name] = value
	}

	return converted
}
|
||||
|
||||
func (s xpathScraperConfig) process(doc *html.Node, common commonXPathConfig) []xPathResult {
|
||||
var ret []xPathResult
|
||||
|
||||
for k, v := range s {
|
||||
asStr, isStr := v.(string)
|
||||
|
||||
if isStr {
|
||||
// apply common
|
||||
if common != nil {
|
||||
asStr = common.applyCommon(asStr)
|
||||
}
|
||||
|
||||
found := htmlquery.Find(doc, asStr)
|
||||
if len(found) > 0 {
|
||||
for i, elem := range found {
|
||||
if i >= len(ret) {
|
||||
ret = append(ret, make(xPathResult))
|
||||
}
|
||||
|
||||
ret[i][k] = elem
|
||||
}
|
||||
}
|
||||
}
|
||||
// TODO - handle map type
|
||||
}
|
||||
|
||||
return ret
|
||||
}
|
||||
|
||||
type xpathScrapers map[string]*xpathScraper
|
||||
|
||||
type xpathScraper struct {
|
||||
Common commonXPathConfig `yaml:"common"`
|
||||
Scene xpathScraperConfig `yaml:"scene"`
|
||||
Performer xpathScraperConfig `yaml:"performer"`
|
||||
}
|
||||
|
||||
// Keys of the scene configuration whose values are nested sub-configurations
// rather than plain XPath expressions.

// XPathScraperConfigSceneTags is the scene key holding the tag sub-configuration.
const XPathScraperConfigSceneTags = "Tags"

// XPathScraperConfigScenePerformers is the scene key holding the performer sub-configuration.
const XPathScraperConfigScenePerformers = "Performers"

// XPathScraperConfigSceneStudio is the scene key holding the studio sub-configuration.
const XPathScraperConfigSceneStudio = "Studio"
|
||||
|
||||
func (s xpathScraper) GetSceneSimple() xpathScraperConfig {
|
||||
// exclude the complex sub-configs
|
||||
ret := make(xpathScraperConfig)
|
||||
mapped := s.Scene
|
||||
|
||||
if mapped != nil {
|
||||
for k, v := range mapped {
|
||||
if k != XPathScraperConfigSceneTags && k != XPathScraperConfigScenePerformers && k != XPathScraperConfigSceneStudio {
|
||||
ret[k] = v
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ret
|
||||
}
|
||||
|
||||
func (s xpathScraper) getSceneSubMap(key string) xpathScraperConfig {
|
||||
var ret map[interface{}]interface{}
|
||||
mapped := s.Scene
|
||||
|
||||
if mapped != nil {
|
||||
v, ok := mapped[key]
|
||||
if ok {
|
||||
ret, _ = v.(map[interface{}]interface{})
|
||||
}
|
||||
}
|
||||
|
||||
if ret != nil {
|
||||
return createXPathScraperConfig(ret)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s xpathScraper) GetScenePerformers() xpathScraperConfig {
|
||||
return s.getSceneSubMap(XPathScraperConfigScenePerformers)
|
||||
}
|
||||
|
||||
func (s xpathScraper) GetSceneTags() xpathScraperConfig {
|
||||
return s.getSceneSubMap(XPathScraperConfigSceneTags)
|
||||
}
|
||||
|
||||
func (s xpathScraper) GetSceneStudio() xpathScraperConfig {
|
||||
return s.getSceneSubMap(XPathScraperConfigSceneStudio)
|
||||
}
|
||||
|
||||
func (s xpathScraper) scrapePerformer(doc *html.Node) (*models.ScrapedPerformer, error) {
|
||||
var ret models.ScrapedPerformer
|
||||
|
||||
performerMap := s.Performer
|
||||
if performerMap == nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
results := performerMap.process(doc, s.Common)
|
||||
if len(results) > 0 {
|
||||
results[0].apply(&ret)
|
||||
}
|
||||
|
||||
return &ret, nil
|
||||
}
|
||||
|
||||
func (s xpathScraper) scrapeScene(doc *html.Node) (*models.ScrapedScene, error) {
|
||||
var ret models.ScrapedScene
|
||||
|
||||
sceneMap := s.GetSceneSimple()
|
||||
if sceneMap == nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
scenePerformersMap := s.GetScenePerformers()
|
||||
sceneTagsMap := s.GetSceneTags()
|
||||
sceneStudioMap := s.GetSceneStudio()
|
||||
|
||||
results := sceneMap.process(doc, s.Common)
|
||||
if len(results) > 0 {
|
||||
results[0].apply(&ret)
|
||||
|
||||
// now apply the performers and tags
|
||||
if scenePerformersMap != nil {
|
||||
performerResults := scenePerformersMap.process(doc, s.Common)
|
||||
|
||||
for _, p := range performerResults {
|
||||
performer := &models.ScrapedScenePerformer{}
|
||||
p.apply(performer)
|
||||
ret.Performers = append(ret.Performers, performer)
|
||||
}
|
||||
}
|
||||
|
||||
if sceneTagsMap != nil {
|
||||
tagResults := sceneTagsMap.process(doc, s.Common)
|
||||
|
||||
for _, p := range tagResults {
|
||||
tag := &models.ScrapedSceneTag{}
|
||||
p.apply(tag)
|
||||
ret.Tags = append(ret.Tags, tag)
|
||||
}
|
||||
}
|
||||
|
||||
if sceneStudioMap != nil {
|
||||
studioResults := sceneStudioMap.process(doc, s.Common)
|
||||
|
||||
if len(studioResults) > 0 {
|
||||
studio := &models.ScrapedSceneStudio{}
|
||||
studioResults[0].apply(studio)
|
||||
ret.Studio = studio
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return &ret, nil
|
||||
}
|
||||
|
||||
type xPathResult map[string]*html.Node
|
||||
|
||||
func (r xPathResult) apply(dest interface{}) {
|
||||
destVal := reflect.ValueOf(dest)
|
||||
|
||||
// dest should be a pointer
|
||||
destVal = destVal.Elem()
|
||||
|
||||
for key, v := range r {
|
||||
field := destVal.FieldByName(key)
|
||||
|
||||
if field.IsValid() {
|
||||
value := htmlquery.InnerText(v)
|
||||
value = strings.TrimSpace(value)
|
||||
|
||||
// remove multiple whitespace and end lines
|
||||
re := regexp.MustCompile("\n")
|
||||
value = re.ReplaceAllString(value, "")
|
||||
re = regexp.MustCompile(" +")
|
||||
value = re.ReplaceAllString(value, " ")
|
||||
|
||||
var reflectValue reflect.Value
|
||||
if field.Kind() == reflect.Ptr {
|
||||
reflectValue = reflect.ValueOf(&value)
|
||||
} else {
|
||||
reflectValue = reflect.ValueOf(value)
|
||||
}
|
||||
|
||||
field.Set(reflectValue)
|
||||
} else {
|
||||
logger.Errorf("Field %s does not exist in %T", key, dest)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func scrapePerformerURLXpath(c scraperTypeConfig, url string) (*models.ScrapedPerformer, error) {
|
||||
scraper := c.scraperConfig.XPathScrapers[c.Scraper]
|
||||
|
||||
if scraper == nil {
|
||||
return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
|
||||
}
|
||||
|
||||
doc, err := htmlquery.LoadURL(url)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return scraper.scrapePerformer(doc)
|
||||
}
|
||||
|
||||
func scrapeSceneURLXPath(c scraperTypeConfig, url string) (*models.ScrapedScene, error) {
|
||||
scraper := c.scraperConfig.XPathScrapers[c.Scraper]
|
||||
|
||||
if scraper == nil {
|
||||
return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
|
||||
}
|
||||
|
||||
doc, err := htmlquery.LoadURL(url)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return scraper.scrapeScene(doc)
|
||||
}
|
||||
732
pkg/scraper/xpath_test.go
Normal file
732
pkg/scraper/xpath_test.go
Normal file
@@ -0,0 +1,732 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/antchfx/htmlquery"
|
||||
"github.com/stashapp/stash/pkg/models"
|
||||
"gopkg.in/yaml.v2"
|
||||
)
|
||||
|
||||
// adapted from https://www.freeones.com/html/m_links/bio_Mia_Malkova.php
|
||||
const htmlDoc1 = `
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">
|
||||
<head>
|
||||
<title>Freeones: Mia Malkova Biography</title>
|
||||
</head>
|
||||
<body data-babe="Mia Malkova">
|
||||
<div class="ContentBlock Block1">
|
||||
<div class="ContentBlockBody" style="padding: 0px;">
|
||||
<table id="biographyTable" border="0" cellspacing="0" cellpadding="0" width="100%">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td class="paramname">
|
||||
<div><b>Babe Name:</b></div>
|
||||
</td>
|
||||
<td class="paramvalue">
|
||||
<a href="/html/m_links/Mia_Malkova/">Mia Malkova</a>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="paramname">
|
||||
<div><b>Profession:</b></div>
|
||||
</td>
|
||||
<td class="paramvalue">Porn Star
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="paramname">
|
||||
<b>Ethnicity:</b>
|
||||
</td>
|
||||
<td class="paramvalue">
|
||||
Caucasian
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="paramname">
|
||||
<b>Country of Origin:</b>
|
||||
</td>
|
||||
<td class="paramvalue">
|
||||
|
||||
<span class="country-us">
|
||||
|
||||
United States
|
||||
<span>
|
||||
</span></span></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="paramname">
|
||||
<b>Date of Birth:</b>
|
||||
</td>
|
||||
<td class="paramvalue">
|
||||
July 1, 1992 (27 years old)
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="paramname">
|
||||
<b>Aliases:</b>
|
||||
</td>
|
||||
<td class="paramvalue">
|
||||
Mia Bliss, Madison Clover, Madison Swan, Mia Mountain, Jessica
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="paramname">
|
||||
<b>Eye Color:</b>
|
||||
</td>
|
||||
<td class="paramvalue">
|
||||
Hazel
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="paramname">
|
||||
<b>Hair Color:</b>
|
||||
</td>
|
||||
<td class="paramvalue">
|
||||
Blonde
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="paramname">
|
||||
<b>Height:</b>
|
||||
</td>
|
||||
<td class="paramvalue">
|
||||
<script type="text/javascript">
|
||||
<!--
|
||||
heightcm = "171";
|
||||
morethenone = 'inch';
|
||||
feet = heightcm / 30.48;
|
||||
inches = (feet - Math.floor(feet)) * 30.48 / 2.54;
|
||||
|
||||
feet = Math.floor(feet);
|
||||
inches = inches.toFixed(0);
|
||||
|
||||
if (inches > 1) {
|
||||
morethenone = 'inches';
|
||||
}
|
||||
|
||||
if (heightcm == 0) {
|
||||
message = 'Unknown';
|
||||
} else {
|
||||
message = '171 cm - ' + feet + ' feet and ' + inches + ' ' + morethenone;
|
||||
}
|
||||
document.write(message);
|
||||
// -->
|
||||
</script>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="paramname">
|
||||
<b>Measurements:</b>
|
||||
</td>
|
||||
<td class="paramvalue">
|
||||
34C-26-36
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="paramname">
|
||||
<b>Fake boobs:</b>
|
||||
</td>
|
||||
<td class="paramvalue">
|
||||
No
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="paramname">
|
||||
<b>Career Start And End</b>
|
||||
</td>
|
||||
<td class="paramvalue">
|
||||
2012 - 2019
|
||||
(7 Years In The Business)
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="paramname">
|
||||
<b>Tattoos:</b>
|
||||
</td>
|
||||
<td class="paramvalue">
|
||||
None
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="paramname">
|
||||
<b>Piercings:</b>
|
||||
</td>
|
||||
<td class="paramvalue">
|
||||
None
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="paramname">
|
||||
<div><b>Social Network Links:</b></div>
|
||||
</td>
|
||||
<td class="paramvalue">
|
||||
<ul id="socialmedia">
|
||||
<li class="twitter"><a href="https://twitter.com/MiaMalkova" target="_blank" alt="Mia Malkova Twitter" title="Mia Malkova Twitter">Twitter</a></li>
|
||||
<li class="facebook"><a href="https://www.facebook.com/MiaMalcove" target="_blank" alt="Mia Malkova Facebook" title="Mia Malkova Facebook">Facebook</a></li>
|
||||
<li class="youtube"><a href="https://www.youtube.com/channel/UCEPR0sZKa_ScMoyhemfB7nA" target="_blank" alt="Mia Malkova YouTube" title="Mia Malkova YouTube">YouTube</a></li>
|
||||
<li class="instagram"><a href="https://www.instagram.com/mia_malkova/" target="_blank" alt="Mia Malkova Instagram" title="Mia Malkova Instagram">Instagram</a></li>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
`
|
||||
|
||||
// makeCommonXPath builds the XPath expression that selects the "paramvalue"
// cell of the biography table row whose bold "paramname" label equals attr.
func makeCommonXPath(attr string) string {
	const (
		beforeAttr = `//table[@id="biographyTable"]//tr/td[@class="paramname"]//b[text() = '`
		afterAttr  = `']/ancestor::tr/td[@class="paramvalue"]`
	)

	return beforeAttr + attr + afterAttr
}
|
||||
|
||||
func makeXPathConfig() xpathScraperConfig {
|
||||
config := make(xpathScraperConfig)
|
||||
|
||||
config["Name"] = makeCommonXPath("Babe Name:") + `/a`
|
||||
config["Ethnicity"] = makeCommonXPath("Ethnicity:")
|
||||
config["Country"] = makeCommonXPath("Country of Origin:")
|
||||
config["Birthdate"] = makeCommonXPath("Date of Birth:")
|
||||
config["Aliases"] = makeCommonXPath("Aliases:")
|
||||
config["EyeColor"] = makeCommonXPath("Eye Color:")
|
||||
config["Measurements"] = makeCommonXPath("Measurements:")
|
||||
config["FakeTits"] = makeCommonXPath("Fake boobs:")
|
||||
config["Height"] = makeCommonXPath("Height:")
|
||||
// no colon in attribute header
|
||||
config["CareerLength"] = makeCommonXPath("Career Start And End")
|
||||
config["Tattoos"] = makeCommonXPath("Tattoos:")
|
||||
config["Piercings"] = makeCommonXPath("Piercings:")
|
||||
|
||||
return config
|
||||
}
|
||||
|
||||
// verifyField reports a test error unless actual is non-nil and points to a
// string equal to expected. field names the value in the failure message.
func verifyField(t *testing.T, expected string, actual *string, field string) {
	t.Helper()

	switch {
	case actual == nil:
		t.Errorf("Expected %s to be set to %s, instead got nil", field, expected)
	case *actual != expected:
		t.Errorf("Expected %s to be set to %s, instead got %s", field, expected, *actual)
	}
}
|
||||
|
||||
func TestScrapePerformerXPath(t *testing.T) {
|
||||
reader := strings.NewReader(htmlDoc1)
|
||||
doc, err := htmlquery.Parse(reader)
|
||||
|
||||
if err != nil {
|
||||
t.Errorf("Error loading document: %s", err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
xpathConfig := makeXPathConfig()
|
||||
|
||||
scraper := xpathScraper{
|
||||
Performer: xpathConfig,
|
||||
}
|
||||
|
||||
performer, err := scraper.scrapePerformer(doc)
|
||||
|
||||
if err != nil {
|
||||
t.Errorf("Error scraping performer: %s", err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
const performerName = "Mia Malkova"
|
||||
const ethnicity = "Caucasian"
|
||||
const country = "United States"
|
||||
const birthdate = "July 1, 1992 (27 years old)"
|
||||
const aliases = "Mia Bliss, Madison Clover, Madison Swan, Mia Mountain, Jessica"
|
||||
const eyeColor = "Hazel"
|
||||
const measurements = "34C-26-36"
|
||||
const fakeTits = "No"
|
||||
const careerLength = "2012 - 2019"
|
||||
const tattoosPiercings = "None"
|
||||
|
||||
verifyField(t, performerName, performer.Name, "Name")
|
||||
verifyField(t, ethnicity, performer.Ethnicity, "Ethnicity")
|
||||
verifyField(t, country, performer.Country, "Country")
|
||||
verifyField(t, birthdate, performer.Birthdate, "Birthdate")
|
||||
verifyField(t, aliases, performer.Aliases, "Aliases")
|
||||
verifyField(t, eyeColor, performer.EyeColor, "EyeColor")
|
||||
verifyField(t, measurements, performer.Measurements, "Measurements")
|
||||
verifyField(t, fakeTits, performer.FakeTits, "FakeTits")
|
||||
|
||||
// TODO - this needs post-processing
|
||||
//verifyField(t, careerLength, performer.CareerLength, "CareerLength")
|
||||
|
||||
verifyField(t, tattoosPiercings, performer.Tattoos, "Tattoos")
|
||||
verifyField(t, tattoosPiercings, performer.Piercings, "Piercings")
|
||||
}
|
||||
|
||||
const sceneHTML = `
|
||||
<!DOCTYPE html>
|
||||
|
||||
<head>
|
||||
<title>Test Video - Pornhub.com</title>
|
||||
|
||||
<meta property="og:title" content="Test Video" />
|
||||
<meta property="og:description"
|
||||
content="Watch Test Video on Pornhub.com, the best hardcore porn site. Pornhub is home to the widest selection of free Babe sex videos full of the hottest pornstars. If you're craving 3some XXX movies you'll find them here." />
|
||||
<meta property="og:image"
|
||||
content="https://di.phncdn.com/videos/201910/13/254476211/thumbs_80/(m=eaAaGwObaaaa)(mh=_V1YEGdMFS1rEYoW)9.jpg" />
|
||||
|
||||
<script type="application/ld+json">
|
||||
{
|
||||
"@context": "http://schema.org/",
|
||||
"@type": "VideoObject",
|
||||
"name": "Test Video",
|
||||
"embedUrl": "https://www.pornhub.com/embed/ph5da270596459c",
|
||||
"duration": "PT00H33M27S",
|
||||
"thumbnailUrl": "https://di.phncdn.com/videos/201910/13/254476211/thumbs_80/(m=eaAaGwObaaaa)(mh=_V1YEGdMFS1rEYoW)9.jpg",
|
||||
"uploadDate": "2019-10-13T00:33:51+00:00",
|
||||
"description": "Watch Test Video on Pornhub.com, the best hardcore porn site. Pornhub is home to the widest selection of free Babe sex videos full of the hottest pornstars. If you're craving 3some XXX movies you'll find them here.",
|
||||
"author" : "Mia Malkova", "interactionStatistic": [
|
||||
{
|
||||
"@type": "InteractionCounter",
|
||||
"interactionType": "http://schema.org/WatchAction",
|
||||
"userInteractionCount": "5,908,861"
|
||||
},
|
||||
{
|
||||
"@type": "InteractionCounter",
|
||||
"interactionType": "http://schema.org/LikeAction",
|
||||
"userInteractionCount": "22,090"
|
||||
}
|
||||
]
|
||||
}
|
||||
</script>
|
||||
</head>
|
||||
|
||||
<body class="logged-out">
|
||||
<div class="container ">
|
||||
|
||||
|
||||
<div id="main-container" class="clearfix" data-delete-check="1" data-is-private="1" data-is-premium=""
|
||||
data-liu="0" data-next-shuffle="ph5da270596459c" data-pkey="" data-platform-pc="1" data-playlist-check="0"
|
||||
data-playlist-id-check="0" data-playlist-geo-check="0" data-friend="0" data-playlist-user-check="0"
|
||||
data-playlist-video-check="0" data-playlist-shuffle="0" data-shuffle-forward="ph5da270596459c"
|
||||
data-shuffle-back="ph5da270596459c" data-min-large="1350"
|
||||
data-video-title="Test Video">
|
||||
|
||||
<div id="vpContentContainer">
|
||||
<div id="hd-leftColVideoPage">
|
||||
<div class="video-wrapper">
|
||||
<div class="title-container">
|
||||
<i class="isMe tooltipTrig" data-title="Video of verified member"></i>
|
||||
<h1 class="title">
|
||||
<span class="inlineFree">Test Video</span>
|
||||
</h1>
|
||||
</div>
|
||||
|
||||
<div class="video-actions-container">
|
||||
<div class="video-actions-tabs">
|
||||
<div class="video-action-tab about-tab active">
|
||||
<div class="video-detailed-info">
|
||||
<div class="video-info-row">
|
||||
From:
|
||||
|
||||
<div class="usernameWrap clearfix" data-type="channel" data-userid="492538092"
|
||||
data-liu-user="0"
|
||||
data-json-url="/user/box?id=492538092&token=MTU3NzA1NTkzNIqATol8v_WrhmNTXkeflvG09C2U7UUT_NyoZUFa7iKq0mlzBkmdgAH1aNHZkJmIOHbbwmho1BehHDoA63K5Wn4."
|
||||
data-disable-popover="0">
|
||||
|
||||
<a rel="" href="/channels/sis-loves-me" class="bolded">Sis Loves Me</a>
|
||||
<div class="avatarPosition"></div>
|
||||
</div>
|
||||
|
||||
<span class="verified-icon flag tooltipTrig"
|
||||
data-title="Verified member"></span>
|
||||
- 87 videos
|
||||
<span class="subscribers-count"> 459466</span>
|
||||
</div>
|
||||
|
||||
<div class="video-info-row">
|
||||
<div class="pornstarsWrapper">
|
||||
Pornstars:
|
||||
<a class="pstar-list-btn js-mxp" data-mxptype="Pornstar"
|
||||
data-mxptext="Alex D" data-id="251341" data-login="1"
|
||||
href="/pornstar/alex-d">Alex D <span
|
||||
class="psbox-link-container display-none"></span>
|
||||
</a>
|
||||
, <a class="pstar-list-btn js-mxp" data-mxptype="Pornstar"
|
||||
data-mxptext="Mia Malkova" data-id="10641" data-login="1"
|
||||
href="/pornstar/mia-malkova">Mia Malkova <span
|
||||
class="psbox-link-container display-none"></span>
|
||||
</a>
|
||||
, <a class="pstar-list-btn js-mxp" data-mxptype="Pornstar"
|
||||
data-mxptext="Riley Reid" data-id="5343" data-login="1"
|
||||
href="/pornstar/riley-reid">Riley Reid <span
|
||||
class="psbox-link-container display-none"></span>
|
||||
</a>
|
||||
<div class="tooltipTrig suggestBtn" data-title="Add a pornstar">
|
||||
<a class="add-btn-small add-pornstar-btn-2">+
|
||||
<span>Suggest</span></a>
|
||||
</div>
|
||||
<div id="deletePornstarResult" class="suggest-result"></div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="video-info-row showLess">
|
||||
<div class="categoriesWrapper">
|
||||
Categories:
|
||||
<a href="/video?c=3"
|
||||
onclick="ga('send', 'event', 'Watch Page', 'click', 'Category');">Amateur</a>,
|
||||
<a href="/categories/babe"
|
||||
onclick="ga('send', 'event', 'Watch Page', 'click', 'Category');">Babe</a>,
|
||||
<a href="/video?c=13"
|
||||
onclick="ga('send', 'event', 'Watch Page', 'click', 'Category');">Blowjob</a>,
|
||||
<a href="/video?c=115"
|
||||
onclick="ga('send', 'event', 'Watch Page', 'click', 'Category');">Exclusive</a>,
|
||||
<a href="/hd"
|
||||
onclick="ga('send', 'event', 'Watch Page', 'click', 'Category');">HD
|
||||
Porn</a>, <a href="/categories/pornstar"
|
||||
onclick="ga('send', 'event', 'Watch Page', 'click', 'Category');">Pornstar</a>,
|
||||
<a href="/video?c=24"
|
||||
onclick="ga('send', 'event', 'Watch Page', 'click', 'Category');">Public</a>,
|
||||
<a href="/video?c=131"
|
||||
onclick="ga('send', 'event', 'Watch Page', 'click', 'Category');">Pussy
|
||||
Licking</a>, <a href="/video?c=65"
|
||||
onclick="ga('send', 'event', 'Watch Page', 'click', 'Category');">Threesome</a>,
|
||||
<a href="/video?c=139"
|
||||
onclick="ga('send', 'event', 'Watch Page', 'click', 'Category');">Verified
|
||||
Models</a>
|
||||
<div class="tooltipTrig suggestBtn" data-title="Suggest Categories">
|
||||
<a id="categoryLink" class="add-btn-small ">+
|
||||
<span>Suggest</span></a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="video-info-row showLess">
|
||||
<div class="productionWrapper">
|
||||
Production:
|
||||
<a href="/video?p=professional" rel="nofollow"
|
||||
class="production">professional</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="video-info-row showLess">
|
||||
<div class="tagsWrapper">
|
||||
Tags:
|
||||
<a href="/video/search?search=3some">3some</a>, <a
|
||||
href="/video?c=9">blonde</a>, <a href="/video?c=59">small tits</a>,
|
||||
<a href="/video/search?search=butt">butt</a>, <a
|
||||
href="/video/search?search=natural+tits">natural tits</a>, <a
|
||||
href="/video/search?search=petite">petite</a>, <a
|
||||
href="/video?c=24">public</a>, <a
|
||||
href="/video/search?search=outside">outside</a>, <a
|
||||
href="/video/search?search=car">car</a>, <a
|
||||
href="/video/search?search=garage">garage</a>, <a
|
||||
href="/video?c=65">threesome</a>, <a
|
||||
href="/video/search?search=bgg">bgg</a>, <a
|
||||
href="/video/search?search=girlfrien+d">girlfrien d</a>, <a
|
||||
href="/video/search?search=parking">parking</a>, <a
|
||||
href="/video/search?search=sex">sex</a>, <a
|
||||
href="/video/search?search=gagging">gagging</a>, <a
|
||||
href="/video?c=13">blowjob</a>, <a
|
||||
href="/video/search?search=bj">bj</a>, <a
|
||||
href="/video/search?search=double">double</a>, <a
|
||||
href="/video/search?search=ass">ass</a>
|
||||
<div class="tooltipTrig suggestBtn" data-title="Suggest Tags">
|
||||
<a id="tagLink" class="add-btn-small">+ <span>Suggest</span></a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="video-info-row showLess">
|
||||
Added on: <span class="white">2 months ago</span>
|
||||
</div>
|
||||
|
||||
<div class="video-info-row showLess">
|
||||
Featured on: <span class="white">1 month ago</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="video-action-tab jump-to-tab">
|
||||
<div class="title">Jump to your favorite action</div>
|
||||
|
||||
<div class="filters mainFilter float-right">
|
||||
<div class="dropdownTrigger">
|
||||
<div>
|
||||
<span class="textFilter" id="tagSort">Sequence</span>
|
||||
<span class="arrowFilters"></span>
|
||||
</div>
|
||||
<ul class="filterListItem dropdownWrapper">
|
||||
<li class="active"><a class="actionTagSort"
|
||||
data-sort="seconds">Sequence</a></li>
|
||||
<li><a class="actionTagSort" data-sort="tag">Alphabetical</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="reset"></div>
|
||||
<div class="display-grid col-4 gap-row-none sortBy seconds">
|
||||
<ul class="actionTagList full-width margin-none">
|
||||
<li>
|
||||
<a class="js-triggerJumpCat"
|
||||
onclick="jumpToAction(862), ga('send', 'event', 'Video Page', 'click', 'Jump to Blowjob');">
|
||||
Blowjob </a>
|
||||
|
||||
<var>14:22</var>
|
||||
</li>
|
||||
<li>
|
||||
<a class="js-triggerJumpCat"
|
||||
onclick="jumpToAction(1117), ga('send', 'event', 'Video Page', 'click', 'Jump to Reverse Cowgirl');">
|
||||
Reverse Cowgirl </a>
|
||||
|
||||
<var>18:37</var>
|
||||
</li>
|
||||
</ul>
|
||||
<ul class="actionTagList full-width margin-none">
|
||||
<li>
|
||||
<a class="js-triggerJumpCat"
|
||||
onclick="jumpToAction(1182), ga('send', 'event', 'Video Page', 'click', 'Jump to Cowgirl');">
|
||||
Cowgirl </a>
|
||||
|
||||
<var>19:42</var>
|
||||
</li>
|
||||
<li>
|
||||
<a class="js-triggerJumpCat"
|
||||
onclick="jumpToAction(1625), ga('send', 'event', 'Video Page', 'click', 'Jump to Cowgirl');">
|
||||
Cowgirl </a>
|
||||
|
||||
<var>27:05</var>
|
||||
</li>
|
||||
</ul>
|
||||
<ul class="actionTagList full-width margin-none">
|
||||
<li>
|
||||
<a class="js-triggerJumpCat"
|
||||
onclick="jumpToAction(1822), ga('send', 'event', 'Video Page', 'click', 'Jump to Doggystyle');">
|
||||
Doggystyle </a>
|
||||
|
||||
<var>30:22</var>
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
</div>
|
||||
<div class="display-grid col-4 gap-row-none sortBy tag">
|
||||
<ul class="actionTagList full-width margin-none">
|
||||
<li>
|
||||
<a class="js-triggerJumpCat"
|
||||
onclick="jumpToAction(862), ga('send', 'event', 'Video Page', 'click', 'Jump to Blowjob');">
|
||||
Blowjob </a>
|
||||
|
||||
<var>14:22</var>
|
||||
</li>
|
||||
<li>
|
||||
<a class="js-triggerJumpCat"
|
||||
onclick="jumpToAction(1117), ga('send', 'event', 'Video Page', 'click', 'Jump to Reverse Cowgirl');">
|
||||
Reverse Cowgirl </a>
|
||||
|
||||
<var>18:37</var>
|
||||
</li>
|
||||
</ul>
|
||||
<ul class="actionTagList full-width margin-none">
|
||||
<li>
|
||||
<a class="js-triggerJumpCat"
|
||||
onclick="jumpToAction(1182), ga('send', 'event', 'Video Page', 'click', 'Jump to Cowgirl');">
|
||||
Cowgirl </a>
|
||||
|
||||
<var>19:42</var>
|
||||
</li>
|
||||
<li>
|
||||
<a class="js-triggerJumpCat"
|
||||
onclick="jumpToAction(1625), ga('send', 'event', 'Video Page', 'click', 'Jump to Cowgirl');">
|
||||
Cowgirl </a>
|
||||
|
||||
<var>27:05</var>
|
||||
</li>
|
||||
</ul>
|
||||
<ul class="actionTagList full-width margin-none">
|
||||
<li>
|
||||
<a class="js-triggerJumpCat"
|
||||
onclick="jumpToAction(1822), ga('send', 'event', 'Video Page', 'click', 'Jump to Doggystyle');">
|
||||
Doggystyle </a>
|
||||
|
||||
<var>30:22</var>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
</html>`
|
||||
|
||||
func makeSceneXPathConfig() xpathScraper {
|
||||
common := make(commonXPathConfig)
|
||||
|
||||
common["$performerElem"] = `//div[@class="pornstarsWrapper"]/a[@data-mxptype="Pornstar"]`
|
||||
common["$studioElem"] = `//div[@data-type="channel"]/a`
|
||||
|
||||
config := make(xpathScraperConfig)
|
||||
|
||||
config["Title"] = `//meta[@property="og:title"]/@content`
|
||||
// this needs post-processing
|
||||
config["Date"] = `//script[@type="application/ld+json"]`
|
||||
|
||||
tagConfig := make(map[interface{}]interface{})
|
||||
tagConfig["Name"] = `//div[@class="categoriesWrapper"]//a[not(@class="add-btn-small ")]`
|
||||
config["Tags"] = tagConfig
|
||||
|
||||
performerConfig := make(map[interface{}]interface{})
|
||||
performerConfig["Name"] = `$performerElem/@data-mxptext`
|
||||
performerConfig["URL"] = `$performerElem/@href`
|
||||
config["Performers"] = performerConfig
|
||||
|
||||
studioConfig := make(map[interface{}]interface{})
|
||||
studioConfig["Name"] = `$studioElem`
|
||||
studioConfig["URL"] = `$studioElem/@href`
|
||||
config["Studio"] = studioConfig
|
||||
|
||||
scraper := xpathScraper{
|
||||
Scene: config,
|
||||
Common: common,
|
||||
}
|
||||
|
||||
return scraper
|
||||
}
|
||||
|
||||
func verifyTags(t *testing.T, expectedTagNames []string, actualTags []*models.ScrapedSceneTag) {
|
||||
t.Helper()
|
||||
|
||||
i := 0
|
||||
for i < len(expectedTagNames) || i < len(actualTags) {
|
||||
expectedTag := ""
|
||||
actualTag := ""
|
||||
if i < len(expectedTagNames) {
|
||||
expectedTag = expectedTagNames[i]
|
||||
}
|
||||
if i < len(actualTags) {
|
||||
actualTag = actualTags[i].Name
|
||||
}
|
||||
|
||||
if expectedTag != actualTag {
|
||||
t.Errorf("Expected tag %s, got %s", expectedTag, actualTag)
|
||||
}
|
||||
i++
|
||||
}
|
||||
}
|
||||
|
||||
func verifyPerformers(t *testing.T, expectedNames []string, expectedURLs []string, actualPerformers []*models.ScrapedScenePerformer) {
|
||||
t.Helper()
|
||||
|
||||
i := 0
|
||||
for i < len(expectedNames) || i < len(actualPerformers) {
|
||||
expectedName := ""
|
||||
actualName := ""
|
||||
expectedURL := ""
|
||||
actualURL := ""
|
||||
if i < len(expectedNames) {
|
||||
expectedName = expectedNames[i]
|
||||
}
|
||||
if i < len(expectedURLs) {
|
||||
expectedURL = expectedURLs[i]
|
||||
}
|
||||
if i < len(actualPerformers) {
|
||||
actualName = actualPerformers[i].Name
|
||||
if actualPerformers[i].URL != nil {
|
||||
actualURL = *actualPerformers[i].URL
|
||||
}
|
||||
}
|
||||
|
||||
if expectedName != actualName {
|
||||
t.Errorf("Expected performer name %s, got %s", expectedName, actualName)
|
||||
}
|
||||
if expectedURL != actualURL {
|
||||
t.Errorf("Expected perfromer URL %s, got %s", expectedName, actualName)
|
||||
}
|
||||
i++
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplySceneXPathConfig(t *testing.T) {
|
||||
reader := strings.NewReader(sceneHTML)
|
||||
doc, err := htmlquery.Parse(reader)
|
||||
|
||||
if err != nil {
|
||||
t.Errorf("Error loading document: %s", err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
scraper := makeSceneXPathConfig()
|
||||
|
||||
scene, err := scraper.scrapeScene(doc)
|
||||
|
||||
if err != nil {
|
||||
t.Errorf("Error scraping scene: %s", err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
const title = "Test Video"
|
||||
|
||||
verifyField(t, title, scene.Title, "Title")
|
||||
|
||||
// verify tags
|
||||
expectedTags := []string{
|
||||
"Amateur",
|
||||
"Babe",
|
||||
"Blowjob",
|
||||
"Exclusive",
|
||||
"HD Porn",
|
||||
"Pornstar",
|
||||
"Public",
|
||||
"Pussy Licking",
|
||||
"Threesome",
|
||||
"Verified Models",
|
||||
}
|
||||
verifyTags(t, expectedTags, scene.Tags)
|
||||
|
||||
expectedPerformerNames := []string{
|
||||
"Alex D",
|
||||
"Mia Malkova",
|
||||
"Riley Reid",
|
||||
}
|
||||
|
||||
expectedPerformerURLs := []string{
|
||||
"/pornstar/alex-d",
|
||||
"/pornstar/mia-malkova",
|
||||
"/pornstar/riley-reid",
|
||||
}
|
||||
|
||||
verifyPerformers(t, expectedPerformerNames, expectedPerformerURLs, scene.Performers)
|
||||
|
||||
const expectedStudioName = "Sis Loves Me"
|
||||
const expectedStudioURL = "/channels/sis-loves-me"
|
||||
|
||||
verifyField(t, expectedStudioName, &scene.Studio.Name, "Studio.Name")
|
||||
verifyField(t, expectedStudioURL, scene.Studio.URL, "Studio.URL")
|
||||
}
|
||||
|
||||
func TestLoadXPathScraperFromYAML(t *testing.T) {
|
||||
const yamlStr = `name: Test
|
||||
performerByURL:
|
||||
- action: scrapeXPath
|
||||
url:
|
||||
- test.com
|
||||
scraper: performerScraper
|
||||
xPathScrapers:
|
||||
performerScraper:
|
||||
performer:
|
||||
name: //h1[@itemprop="name"]
|
||||
`
|
||||
|
||||
config := &scraperConfig{}
|
||||
err := yaml.Unmarshal([]byte(yamlStr), &config)
|
||||
|
||||
if err != nil {
|
||||
t.Errorf("Error loading yaml: %s", err.Error())
|
||||
return
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user