Add "split" xpath in post-processing, newlines in replace support (#579)

This commit is contained in:
bnkai
2020-06-18 03:47:10 +03:00
committed by GitHub
parent 3fbb4cdc32
commit 9d0522f62d
4 changed files with 109 additions and 4 deletions

View File

@@ -75,6 +75,10 @@ func (c xpathRegexConfig) apply(value string) string {
} }
ret := re.ReplaceAllString(value, with) ret := re.ReplaceAllString(value, with)
// replace lines if needed to protect from commonPostprocess
if with == "\n" {
ret = replaceLines(ret)
}
logger.Debugf(`Replace: '%s' with '%s'`, regex, with) logger.Debugf(`Replace: '%s' with '%s'`, regex, with)
logger.Debugf("Before: %s", value) logger.Debugf("Before: %s", value)
@@ -94,6 +98,9 @@ func (c xpathRegexConfigs) apply(value string) string {
// remove whitespace again // remove whitespace again
value = commonPostProcess(value) value = commonPostProcess(value)
// restore replaced lines
value = restoreLines(value)
return value return value
} }
@@ -129,6 +136,15 @@ func (c xpathScraperAttrConfig) getParseDate() string {
return c.getString(parseDateKey) return c.getString(parseDateKey)
} }
// getSplit returns the separator string stored under the "split" key of
// this attribute configuration, as reported by getString.
func (c xpathScraperAttrConfig) getSplit() string {
	return c.getString("split")
}
// hasSplit reports whether a non-empty split separator is configured.
func (c xpathScraperAttrConfig) hasSplit() bool {
	return len(c.getSplit()) > 0
}
func (c xpathScraperAttrConfig) getReplace() xpathRegexConfigs { func (c xpathScraperAttrConfig) getReplace() xpathRegexConfigs {
const replaceKey = "replace" const replaceKey = "replace"
val, _ := c[replaceKey] val, _ := c[replaceKey]
@@ -198,6 +214,36 @@ func (c xpathScraperAttrConfig) parseDate(value string) string {
return parsedValue.Format(internalDateFormat) return parsedValue.Format(internalDateFormat)
} }
// splitString breaks value apart on the configured split separator and
// returns the non-empty pieces in order.
//
// When no separator is configured the value is returned unchanged as the
// only element. When every piece is empty the result is a nil slice.
func (c xpathScraperAttrConfig) splitString(value string) []string {
	sep := c.getSplit()
	if sep == "" {
		return []string{value}
	}

	var parts []string
	for _, piece := range strings.Split(value, sep) {
		// skip the empty strings produced by leading, trailing or
		// consecutive separators
		if piece == "" {
			continue
		}
		parts = append(parts, piece)
	}
	return parts
}
// setKeyAndSplit stores value under key "k" in the results "ret".
//
// Without a split separator the value is stored at index "i". With one, the
// value is split and each piece is stored at consecutive indexes starting
// from "i".
func (c xpathScraperAttrConfig) setKeyAndSplit(ret *xPathResults, value string, k string, i int) {
	if !c.hasSplit() {
		*ret = ret.setKey(i, k, value)
		return
	}

	for offset, part := range c.splitString(value) {
		*ret = ret.setKey(i+offset, k, part)
	}
}
func (c xpathScraperAttrConfig) replaceRegex(value string) string { func (c xpathScraperAttrConfig) replaceRegex(value string) string {
replace := c.getReplace() replace := c.getReplace()
return replace.apply(value) return replace.apply(value)
@@ -258,6 +304,24 @@ func commonPostProcess(value string) string {
return value return value
} }
// replaceLines replaces every newline ("\n") in value with the alert
// character ("\a") so that the newlines are not removed by
// commonPostProcess. restoreLines performs the reverse substitution.
//
// Any alert characters already present in value are dropped first, so that
// only genuine newlines are turned back by restoreLines.
func replaceLines(value string) string {
	// Plain literal substitutions: strings.Replace avoids compiling a
	// regular expression on every call.
	value = strings.Replace(value, "\a", "", -1) // \a shouldn't exist in the string; remove it
	return strings.Replace(value, "\n", "\a", -1)
}
// restoreLines replaces every alert character ("\a") in value with a
// newline ("\n"), undoing the substitution made by replaceLines.
func restoreLines(value string) string {
	// Plain literal substitution: no regular expression needed.
	return strings.Replace(value, "\a", "\n", -1)
}
func runXPathQuery(doc *html.Node, xpath string, common commonXPathConfig) []*html.Node { func runXPathQuery(doc *html.Node, xpath string, common commonXPathConfig) []*html.Node {
// apply common // apply common
if common != nil { if common != nil {
@@ -299,15 +363,13 @@ func (s xpathScraperConfig) process(doc *html.Node, common commonXPathConfig) xP
if attrConfig.hasConcat() { if attrConfig.hasConcat() {
result := attrConfig.concatenateResults(found) result := attrConfig.concatenateResults(found)
result = attrConfig.postProcess(result) result = attrConfig.postProcess(result)
const i = 0 attrConfig.setKeyAndSplit(&ret, result, k, 0)
ret = ret.setKey(i, k, result)
} else { } else {
for i, elem := range found { for i, elem := range found {
text := NodeText(elem) text := NodeText(elem)
text = commonPostProcess(text) text = commonPostProcess(text)
text = attrConfig.postProcess(text) text = attrConfig.postProcess(text)
attrConfig.setKeyAndSplit(&ret, text, k, i)
ret = ret.setKey(i, k, text)
} }
} }
} }

View File

@@ -663,6 +663,14 @@ func makeSceneXPathConfig() xpathScraper {
studioConfig["URL"] = `$studioElem/@href` studioConfig["URL"] = `$studioElem/@href`
config["Studio"] = studioConfig config["Studio"] = studioConfig
const sep = " "
moviesNameConfig := make(map[interface{}]interface{})
moviesNameConfig["selector"] = `//i[@class="isMe tooltipTrig"]/@data-title`
moviesNameConfig["split"] = sep
moviesConfig := make(map[interface{}]interface{})
moviesConfig["Name"] = moviesNameConfig
config["Movies"] = moviesConfig
scraper := xpathScraper{ scraper := xpathScraper{
Scene: config, Scene: config,
Common: common, Common: common,
@@ -692,6 +700,27 @@ func verifyTags(t *testing.T, expectedTagNames []string, actualTags []*models.Sc
} }
} }
// verifyMovies compares the expected movie names against the names of the
// scraped movies, position by position, and reports a test error for every
// mismatch. A position missing from either list is compared as an empty name,
// so differing lengths are reported too.
func verifyMovies(t *testing.T, expectedMovieNames []string, actualMovies []*models.ScrapedSceneMovie) {
	t.Helper()

	// walk as far as the longer of the two lists
	total := len(expectedMovieNames)
	if len(actualMovies) > total {
		total = len(actualMovies)
	}

	for idx := 0; idx < total; idx++ {
		var want, got string
		if idx < len(expectedMovieNames) {
			want = expectedMovieNames[idx]
		}
		if idx < len(actualMovies) {
			got = actualMovies[idx].Name
		}

		if want != got {
			t.Errorf("Expected movie %s, got %s", want, got)
		}
	}
}
func verifyPerformers(t *testing.T, expectedNames []string, expectedURLs []string, actualPerformers []*models.ScrapedScenePerformer) { func verifyPerformers(t *testing.T, expectedNames []string, expectedURLs []string, actualPerformers []*models.ScrapedScenePerformer) {
t.Helper() t.Helper()
@@ -761,6 +790,15 @@ func TestApplySceneXPathConfig(t *testing.T) {
} }
verifyTags(t, expectedTags, scene.Tags) verifyTags(t, expectedTags, scene.Tags)
// verify movies
expectedMovies := []string{
"Video",
"of",
"verified",
"member",
}
verifyMovies(t, expectedMovies, scene.Movies)
expectedPerformerNames := []string{ expectedPerformerNames := []string{
"Alex D", "Alex D",
"Mia Malkova", "Mia Malkova",

View File

@@ -6,6 +6,7 @@ const markup = `
* Add support for parent/child studios. * Add support for parent/child studios.
### 🎨 Improvements ### 🎨 Improvements
* Add split xpath post-processing action.
* Improved the layout of the scene page. * Improved the layout of the scene page.
* Show rating as stars in scene page. * Show rating as stars in scene page.
* Add reload scrapers button. * Add reload scrapers button.

View File

@@ -488,3 +488,7 @@ div.dropdown-menu {
text-transform: uppercase; text-transform: uppercase;
} }
} }
/* Utility class: pre-line collapses runs of spaces and tabs but keeps
   line breaks, so text containing "\n" is displayed on separate lines. */
.pre {
white-space: pre-line;
}