From 9d0522f62d8ea6e36d372cfd231886dae053ee30 Mon Sep 17 00:00:00 2001
From: bnkai <48220860+bnkai@users.noreply.github.com>
Date: Thu, 18 Jun 2020 03:47:10 +0300
Subject: [PATCH] Add "split" xpath in post-processing , newlines in replace support (#579)

---
Review notes (commentary only: text between the three-dash line and the
first file diff is not part of the commit message or of any hunk).

* Layout: the line breaks and indentation below were reconstructed from a
  whitespace-collapsed copy of this patch. Added/removed/context line
  counts were checked against every hunk header, but the exact indentation
  and trailing-comment alignment are assumed (tabs for Go, two spaces for
  TSX/SCSS) -- TODO confirm against the target tree before applying.

* xpathRegexConfig.apply: the newline protection runs only when the
  replacement is exactly "\n" (with == "\n"). A replacement that merely
  contains a newline (for example ", \n") is not passed through
  replaceLines. NOTE(review): confirm whether that is intended.

* replaceLines / restoreLines: both call regexp.MustCompile on every
  invocation for the fixed patterns "\a" and "\n". Neither pattern contains
  a regex metacharacter, so a plain string replacement would express the
  same thing; splitString in this same patch already calls strings.Split.

* xpathRegexConfigs.apply: restoreLines runs unconditionally, while
  replaceLines (the only place that strips a pre-existing "\a") runs only
  in the with == "\n" case. Any "\a" still present in the value at that
  point is therefore turned into "\n".

* process, non-concat branch: each found element i calls
  setKeyAndSplit(&ret, text, k, i), which writes its pieces at indexes
  i, i+1, i+2, ... With "split" configured and more than one found element
  the index ranges of consecutive elements overlap (element 0 with three
  pieces uses 0..2, element 1 starts at 1). NOTE(review): setKey is not
  shown in this patch; presumably a later call replaces the earlier value
  at the same index -- verify.

* splitString: empty pieces are dropped, so a value that is empty or
  consists only of separators yields a nil slice, and setKeyAndSplit then
  makes no setKey call at all for that value.

 pkg/scraper/xpath.go                          | 70 +++++++++++++++++--
 pkg/scraper/xpath_test.go                     | 38 ++++++++++
 .../components/Changelog/versions/v030.tsx    |  1 +
 ui/v2.5/src/index.scss                        |  4 ++
 4 files changed, 109 insertions(+), 4 deletions(-)

diff --git a/pkg/scraper/xpath.go b/pkg/scraper/xpath.go
index b5b99843e..f108dde1c 100644
--- a/pkg/scraper/xpath.go
+++ b/pkg/scraper/xpath.go
@@ -75,6 +75,10 @@ func (c xpathRegexConfig) apply(value string) string {
 		}
 
 		ret := re.ReplaceAllString(value, with)
+		// replace lines if needed to protect from commonPostprocess
+		if with == "\n" {
+			ret = replaceLines(ret)
+		}
 
 		logger.Debugf(`Replace: '%s' with '%s'`, regex, with)
 		logger.Debugf("Before: %s", value)
@@ -94,6 +98,9 @@ func (c xpathRegexConfigs) apply(value string) string {
 
 	// remove whitespace again
 	value = commonPostProcess(value)
+	// restore replaced lines
+
+	value = restoreLines(value)
 
 	return value
 }
@@ -129,6 +136,15 @@ func (c xpathScraperAttrConfig) getParseDate() string {
 	return c.getString(parseDateKey)
 }
 
+func (c xpathScraperAttrConfig) getSplit() string {
+	const splitKey = "split"
+	return c.getString(splitKey)
+}
+
+func (c xpathScraperAttrConfig) hasSplit() bool {
+	return c.getSplit() != ""
+}
+
 func (c xpathScraperAttrConfig) getReplace() xpathRegexConfigs {
 	const replaceKey = "replace"
 	val, _ := c[replaceKey]
@@ -198,6 +214,36 @@ func (c xpathScraperAttrConfig) parseDate(value string) string {
 	return parsedValue.Format(internalDateFormat)
 }
 
+func (c xpathScraperAttrConfig) splitString(value string) []string {
+	separator := c.getSplit()
+	var res []string
+
+	if separator == "" {
+		return []string{value}
+	}
+
+	for _, str := range strings.Split(value, separator) {
+		if str != "" {
+			res = append(res, str)
+		}
+	}
+
+	return res
+}
+
+// setKeyAndSplit sets the key "k" for the results "ret" and splits if needed
+// "i" is the index starting position
+func (c xpathScraperAttrConfig) setKeyAndSplit(ret *xPathResults, value string, k string, i int) {
+	if c.hasSplit() {
+		for j, txt := range c.splitString(value) {
+			*ret = ret.setKey(j+i, k, txt)
+		}
+	} else {
+		*ret = ret.setKey(i, k, value)
+	}
+
+}
+
 func (c xpathScraperAttrConfig) replaceRegex(value string) string {
 	replace := c.getReplace()
 	return replace.apply(value)
@@ -258,6 +304,24 @@ func commonPostProcess(value string) string {
 	return value
 }
 
+// func replaceLines replaces all newlines ("\n") with alert ("\a")
+func replaceLines(value string) string {
+	re := regexp.MustCompile("\a")         // \a shouldn't exist in the string
+	value = re.ReplaceAllString(value, "") // remove it
+	re = regexp.MustCompile("\n")          // replace newlines with (\a)'s so that they don't get removed by commonPostprocess
+	value = re.ReplaceAllString(value, "\a")
+
+	return value
+}
+
+// func restoreLines replaces all alerts ("\a") with newlines ("\n")
+func restoreLines(value string) string {
+	re := regexp.MustCompile("\a")
+	value = re.ReplaceAllString(value, "\n")
+
+	return value
+}
+
 func runXPathQuery(doc *html.Node, xpath string, common commonXPathConfig) []*html.Node {
 	// apply common
 	if common != nil {
@@ -299,15 +363,13 @@ func (s xpathScraperConfig) process(doc *html.Node, common commonXPathConfig) xP
 			if attrConfig.hasConcat() {
 				result := attrConfig.concatenateResults(found)
 				result = attrConfig.postProcess(result)
-				const i = 0
-				ret = ret.setKey(i, k, result)
+				attrConfig.setKeyAndSplit(&ret, result, k, 0)
 			} else {
 				for i, elem := range found {
 					text := NodeText(elem)
 					text = commonPostProcess(text)
 					text = attrConfig.postProcess(text)
-
-					ret = ret.setKey(i, k, text)
+					attrConfig.setKeyAndSplit(&ret, text, k, i)
 				}
 			}
 		}
diff --git a/pkg/scraper/xpath_test.go b/pkg/scraper/xpath_test.go
index 8fad5f513..76b90952c 100644
--- a/pkg/scraper/xpath_test.go
+++ b/pkg/scraper/xpath_test.go
@@ -663,6 +663,14 @@ func makeSceneXPathConfig() xpathScraper {
 	studioConfig["URL"] = `$studioElem/@href`
 	config["Studio"] = studioConfig
 
+	const sep = " "
+	moviesNameConfig := make(map[interface{}]interface{})
+	moviesNameConfig["selector"] = `//i[@class="isMe tooltipTrig"]/@data-title`
+	moviesNameConfig["split"] = sep
+	moviesConfig := make(map[interface{}]interface{})
+	moviesConfig["Name"] = moviesNameConfig
+	config["Movies"] = moviesConfig
+
 	scraper := xpathScraper{
 		Scene:  config,
 		Common: common,
@@ -692,6 +700,27 @@ func verifyTags(t *testing.T, expectedTagNames []string, actualTags []*models.Sc
 	}
 }
 
+func verifyMovies(t *testing.T, expectedMovieNames []string, actualMovies []*models.ScrapedSceneMovie) {
+	t.Helper()
+
+	i := 0
+	for i < len(expectedMovieNames) || i < len(actualMovies) {
+		expectedMovie := ""
+		actualMovie := ""
+		if i < len(expectedMovieNames) {
+			expectedMovie = expectedMovieNames[i]
+		}
+		if i < len(actualMovies) {
+			actualMovie = actualMovies[i].Name
+		}
+
+		if expectedMovie != actualMovie {
+			t.Errorf("Expected movie %s, got %s", expectedMovie, actualMovie)
+		}
+		i++
+	}
+}
+
 func verifyPerformers(t *testing.T, expectedNames []string, expectedURLs []string, actualPerformers []*models.ScrapedScenePerformer) {
 	t.Helper()
 
@@ -761,6 +790,15 @@ func TestApplySceneXPathConfig(t *testing.T) {
 	}
 	verifyTags(t, expectedTags, scene.Tags)
 
+	// verify movies
+	expectedMovies := []string{
+		"Video",
+		"of",
+		"verified",
+		"member",
+	}
+	verifyMovies(t, expectedMovies, scene.Movies)
+
 	expectedPerformerNames := []string{
 		"Alex D",
 		"Mia Malkova",
diff --git a/ui/v2.5/src/components/Changelog/versions/v030.tsx b/ui/v2.5/src/components/Changelog/versions/v030.tsx
index bf50920d6..b41a1bd06 100644
--- a/ui/v2.5/src/components/Changelog/versions/v030.tsx
+++ b/ui/v2.5/src/components/Changelog/versions/v030.tsx
@@ -6,6 +6,7 @@ const markup = `
 * Add support for parent/child studios.
 
 ### 🎨 Improvements
+* Add split xpath post-processing action.
 * Improved the layout of the scene page.
 * Show rating as stars in scene page.
 * Add reload scrapers button.
diff --git a/ui/v2.5/src/index.scss b/ui/v2.5/src/index.scss
index 35dd8fb4d..95db2dd18 100755
--- a/ui/v2.5/src/index.scss
+++ b/ui/v2.5/src/index.scss
@@ -488,3 +488,7 @@ div.dropdown-menu {
     text-transform: uppercase;
   }
 }
+
+.pre {
+  white-space: pre-line;
+}