Get distinct values from scraper (#1338)

Co-authored-by: WithoutPants <53250216+WithoutPants@users.noreply.github.com>
This commit is contained in:
bnkai
2021-04-29 04:38:55 +03:00
committed by GitHub
parent 502d99de1b
commit 597576f5e6
5 changed files with 73 additions and 2 deletions

View File

@@ -12,6 +12,7 @@ import (
"github.com/stashapp/stash/pkg/logger"
"github.com/stashapp/stash/pkg/models"
"github.com/stashapp/stash/pkg/utils"
"gopkg.in/yaml.v2"
)
@@ -73,7 +74,9 @@ func (s mappedConfig) postProcess(q mappedQuery, attrConfig mappedScraperAttrCon
result := attrConfig.concatenateResults(found)
result = attrConfig.postProcess(result, q)
if attrConfig.hasSplit() {
return attrConfig.splitString(result)
results := attrConfig.splitString(result)
results = attrConfig.distinctResults(results)
return results
}
ret = []string{result}
@@ -86,6 +89,7 @@ func (s mappedConfig) postProcess(q mappedQuery, attrConfig mappedScraperAttrCon
ret = append(ret, text)
}
ret = attrConfig.distinctResults(ret)
}
return ret
@@ -639,6 +643,10 @@ func (c mappedScraperAttrConfig) concatenateResults(nodes []string) string {
return strings.Join(result, separator)
}
// distinctResults passes nodes through utils.StrUnique and returns the
// resulting slice, so repeated scraped values are reported only once.
func (c mappedScraperAttrConfig) distinctResults(nodes []string) []string {
	deduped := utils.StrUnique(nodes)
	return deduped
}
func (c mappedScraperAttrConfig) splitString(value string) []string {
separator := c.Split
var res []string

View File

@@ -163,6 +163,8 @@ const htmlDoc1 = `
</td>
<td class="paramvalue">
<ul id="socialmedia">
<!-- Adding twitter twice to verify distict post-processing -->
<li class="twitter"><a href="https://twitter.com/MiaMalkova" target="_blank" alt="Mia Malkova Twitter" title="Mia Malkova Twitter">Twitter</a></li>
<li class="twitter"><a href="https://twitter.com/MiaMalkova" target="_blank" alt="Mia Malkova Twitter" title="Mia Malkova Twitter">Twitter</a></li>
<li class="facebook"><a href="https://www.facebook.com/MiaMalcove" target="_blank" alt="Mia Malkova Facebook" title="Mia Malkova Facebook">Facebook</a></li>
<li class="youtube"><a href="https://www.youtube.com/channel/UCEPR0sZKa_ScMoyhemfB7nA" target="_blank" alt="Mia Malkova YouTube" title="Mia Malkova YouTube">YouTube</a></li>
@@ -270,6 +272,12 @@ func makeXPathConfig() mappedPerformerScraperConfig {
}
config.mappedConfig["Weight"] = weightConfig
tagConfig := mappedScraperAttrConfig{
Selector: `//ul[@id="socialmedia"]//a`,
}
config.Tags = make(mappedConfig)
config.Tags["Name"] = tagConfig
return config
}
@@ -348,6 +356,16 @@ func TestScrapePerformerXPath(t *testing.T) {
verifyField(t, details, performer.Details, "Details")
verifyField(t, hairColor, performer.HairColor, "HairColor")
verifyField(t, weight, performer.Weight, "Weight")
expectedTagNames := []string{
"Twitter",
"Facebook",
"YouTube",
"Instagram",
}
for i, expected := range expectedTagNames {
verifyField(t, expected, &performer.Tags[i].Name, "TagName")
}
}
func TestConcatXPath(t *testing.T) {

View File

@@ -35,6 +35,40 @@ func StrMap(vs []string, f func(string) string) []string {
return vsm
}
// StrAppendUnique returns vs with toAdd appended, unless StrInclude reports
// that vs already holds toAdd, in which case vs is returned as-is.
func StrAppendUnique(vs []string, toAdd string) []string {
	if !StrInclude(vs, toAdd) {
		vs = append(vs, toAdd)
	}
	return vs
}
// StrAppendUniques feeds every element of toAdd, in order, through
// StrAppendUnique, so only values not yet present in vs are appended.
// It returns the resulting slice, which is vs itself when nothing was added.
func StrAppendUniques(vs []string, toAdd []string) []string {
	for i := range toAdd {
		vs = StrAppendUnique(vs, toAdd[i])
	}
	return vs
}
// StrUnique returns a new slice holding the distinct values of vs, in the
// order in which each value first appears; later duplicates are dropped.
// The result is nil when vs has no elements.
func StrUnique(vs []string) []string {
	var unique []string
	seen := make(map[string]struct{})
	for i := range vs {
		value := vs[i]
		if _, dup := seen[value]; dup {
			// Already emitted once; skip the repeat.
			continue
		}
		seen[value] = struct{}{}
		unique = append(unique, value)
	}
	return unique
}
// StringSliceToIntSlice converts a slice of strings to a slice of ints.
// Returns an error if any values cannot be parsed.
func StringSliceToIntSlice(ss []string) ([]int, error) {

View File

@@ -13,6 +13,7 @@
* Added scene queue.
### 🎨 Improvements
* Remove duplicate values when scraping lists of elements.
* Improved performance of the auto-tagger.
* Clean generation artifacts after generating each scene.
* Log message at startup when cleaning the `tmp` and `downloads` generated folders takes more than one second.

View File

@@ -389,7 +389,17 @@ Replaces `2001 to 2003` with `2001-2003`.
Additionally, there are a number of fixed post-processing fields that are specified at the attribute level (not in `postProcess`) that are performed after the `postProcess` operations:
* `concat`: if an xpath matches multiple elements, and `concat` is present, then all of the elements will be concatenated together
* `split`: Its the inverse of `concat`. Splits a string to more elements using the separator given. For more info and examples have a look at PR [#579](https://github.com/stashapp/stash/pull/579)
* `split`: the inverse of `concat`. Splits a string into multiple elements using the given separator. For more info and examples have a look at PR [#579](https://github.com/stashapp/stash/pull/579)
Example:
```yaml
Tags:
Name:
selector: //span[@class="list_attributes"]
split: ","
```
Splits a comma-separated list of tags located in the span and returns the tags.
For backwards compatibility, `replace`, `subscraper` and `parseDate` are also allowed as keys for the attribute.