mirror of
https://github.com/stashapp/stash.git
synced 2025-12-17 12:24:38 +03:00
Get distinct values from scraper (#1338)
Co-authored-by: WithoutPants <53250216+WithoutPants@users.noreply.github.com>
This commit is contained in:
@@ -12,6 +12,7 @@ import (
|
||||
|
||||
"github.com/stashapp/stash/pkg/logger"
|
||||
"github.com/stashapp/stash/pkg/models"
|
||||
"github.com/stashapp/stash/pkg/utils"
|
||||
"gopkg.in/yaml.v2"
|
||||
)
|
||||
|
||||
@@ -73,7 +74,9 @@ func (s mappedConfig) postProcess(q mappedQuery, attrConfig mappedScraperAttrCon
|
||||
result := attrConfig.concatenateResults(found)
|
||||
result = attrConfig.postProcess(result, q)
|
||||
if attrConfig.hasSplit() {
|
||||
return attrConfig.splitString(result)
|
||||
results := attrConfig.splitString(result)
|
||||
results = attrConfig.distinctResults(results)
|
||||
return results
|
||||
}
|
||||
|
||||
ret = []string{result}
|
||||
@@ -86,6 +89,7 @@ func (s mappedConfig) postProcess(q mappedQuery, attrConfig mappedScraperAttrCon
|
||||
|
||||
ret = append(ret, text)
|
||||
}
|
||||
ret = attrConfig.distinctResults(ret)
|
||||
}
|
||||
|
||||
return ret
|
||||
@@ -639,6 +643,10 @@ func (c mappedScraperAttrConfig) concatenateResults(nodes []string) string {
|
||||
return strings.Join(result, separator)
|
||||
}
|
||||
|
||||
func (c mappedScraperAttrConfig) distinctResults(nodes []string) []string {
|
||||
return utils.StrUnique(nodes)
|
||||
}
|
||||
|
||||
func (c mappedScraperAttrConfig) splitString(value string) []string {
|
||||
separator := c.Split
|
||||
var res []string
|
||||
|
||||
@@ -163,6 +163,8 @@ const htmlDoc1 = `
|
||||
</td>
|
||||
<td class="paramvalue">
|
||||
<ul id="socialmedia">
|
||||
<!-- Adding twitter twice to verify distict post-processing -->
|
||||
<li class="twitter"><a href="https://twitter.com/MiaMalkova" target="_blank" alt="Mia Malkova Twitter" title="Mia Malkova Twitter">Twitter</a></li>
|
||||
<li class="twitter"><a href="https://twitter.com/MiaMalkova" target="_blank" alt="Mia Malkova Twitter" title="Mia Malkova Twitter">Twitter</a></li>
|
||||
<li class="facebook"><a href="https://www.facebook.com/MiaMalcove" target="_blank" alt="Mia Malkova Facebook" title="Mia Malkova Facebook">Facebook</a></li>
|
||||
<li class="youtube"><a href="https://www.youtube.com/channel/UCEPR0sZKa_ScMoyhemfB7nA" target="_blank" alt="Mia Malkova YouTube" title="Mia Malkova YouTube">YouTube</a></li>
|
||||
@@ -270,6 +272,12 @@ func makeXPathConfig() mappedPerformerScraperConfig {
|
||||
}
|
||||
config.mappedConfig["Weight"] = weightConfig
|
||||
|
||||
tagConfig := mappedScraperAttrConfig{
|
||||
Selector: `//ul[@id="socialmedia"]//a`,
|
||||
}
|
||||
config.Tags = make(mappedConfig)
|
||||
config.Tags["Name"] = tagConfig
|
||||
|
||||
return config
|
||||
}
|
||||
|
||||
@@ -348,6 +356,16 @@ func TestScrapePerformerXPath(t *testing.T) {
|
||||
verifyField(t, details, performer.Details, "Details")
|
||||
verifyField(t, hairColor, performer.HairColor, "HairColor")
|
||||
verifyField(t, weight, performer.Weight, "Weight")
|
||||
|
||||
expectedTagNames := []string{
|
||||
"Twitter",
|
||||
"Facebook",
|
||||
"YouTube",
|
||||
"Instagram",
|
||||
}
|
||||
for i, expected := range expectedTagNames {
|
||||
verifyField(t, expected, &performer.Tags[i].Name, "TagName")
|
||||
}
|
||||
}
|
||||
|
||||
func TestConcatXPath(t *testing.T) {
|
||||
|
||||
@@ -35,6 +35,40 @@ func StrMap(vs []string, f func(string) string) []string {
|
||||
return vsm
|
||||
}
|
||||
|
||||
// StrAppendUnique appends toAdd to the vs string slice if toAdd does not already
|
||||
// exist in the slice. It returns the new or unchanged string slice.
|
||||
func StrAppendUnique(vs []string, toAdd string) []string {
|
||||
if StrInclude(vs, toAdd) {
|
||||
return vs
|
||||
}
|
||||
|
||||
return append(vs, toAdd)
|
||||
}
|
||||
|
||||
// StrAppendUniques appends the values of toAdd to the vs string slice, skipping
// any value that is already present in vs or that was appended earlier in the
// same call. Values are compared by exact string equality and are appended in
// the order in which they first occur in toAdd. It returns the new or
// unchanged string slice.
func StrAppendUniques(vs []string, toAdd []string) []string {
	if len(toAdd) == 0 {
		return vs
	}

	// Track membership in a set so every candidate costs a single lookup
	// rather than a rescan of the growing slice.
	seen := make(map[string]struct{}, len(vs)+len(toAdd))
	for _, v := range vs {
		seen[v] = struct{}{}
	}

	for _, v := range toAdd {
		if _, exists := seen[v]; exists {
			continue
		}

		seen[v] = struct{}{}
		vs = append(vs, v)
	}

	return vs
}
|
||||
|
||||
// StrUnique returns a new slice holding the first occurrence of every distinct
// value in vs, in the order those values first appear. The result is nil when
// vs contains no elements.
func StrUnique(vs []string) []string {
	var unique []string
	seen := map[string]bool{}

	for i := range vs {
		value := vs[i]
		if seen[value] {
			continue
		}

		seen[value] = true
		unique = append(unique, value)
	}

	return unique
}
|
||||
|
||||
// StringSliceToIntSlice converts a slice of strings to a slice of ints.
|
||||
// Returns an error if any values cannot be parsed.
|
||||
func StringSliceToIntSlice(ss []string) ([]int, error) {
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
* Added scene queue.
|
||||
|
||||
### 🎨 Improvements
|
||||
* Remove duplicate values when scraping lists of elements.
|
||||
* Improved performance of the auto-tagger.
|
||||
* Clean generation artifacts after generating each scene.
|
||||
* Log message at startup when cleaning the `tmp` and `downloads` generated folders takes more than one second.
|
||||
|
||||
@@ -389,7 +389,17 @@ Replaces `2001 to 2003` with `2001-2003`.
|
||||
|
||||
Additionally, there are a number of fixed post-processing fields that are specified at the attribute level (not in `postProcess`) that are performed after the `postProcess` operations:
|
||||
* `concat`: if an xpath matches multiple elements, and `concat` is present, then all of the elements will be concatenated together
|
||||
* `split`: Its the inverse of `concat`. Splits a string to more elements using the separator given. For more info and examples have a look at PR [#579](https://github.com/stashapp/stash/pull/579)
|
||||
* `split`: the inverse of `concat`. Splits a string into multiple elements using the given separator. For more information and examples, see PR [#579](https://github.com/stashapp/stash/pull/579).
|
||||
|
||||
Example:
|
||||
```yaml
|
||||
Tags:
|
||||
Name:
|
||||
selector: //span[@class="list_attributes"]
|
||||
split: ","
|
||||
```
|
||||
Splits the comma-separated list of tags found in the span and returns the individual tags.
|
||||
|
||||
|
||||
For backwards compatibility, `replace`, `subscraper` and `parseDate` are also allowed as keys for the attribute.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user