Scrape tag exclusions (#1617)

* Add config option for scraper tag exclusion patterns

Add a config option for excluding tags / tag patterns from the scraper
results.

* Handle tag exclusion patterns during scraping
This commit is contained in:
gitgiggety
2021-08-10 06:07:01 +02:00
committed by GitHub
parent 404eaa32d2
commit dfd55346b2
10 changed files with 114 additions and 27 deletions

View File

@@ -69,6 +69,7 @@ fragment ConfigScrapingData on ConfigScrapingResult {
scraperUserAgent scraperUserAgent
scraperCertCheck scraperCertCheck
scraperCDPPath scraperCDPPath
excludeTagPatterns
} }
fragment ConfigData on ConfigResult { fragment ConfigData on ConfigResult {

View File

@@ -255,6 +255,8 @@ input ConfigScrapingInput {
scraperCDPPath: String scraperCDPPath: String
"""Whether the scraper should check for invalid certificates""" """Whether the scraper should check for invalid certificates"""
scraperCertCheck: Boolean! scraperCertCheck: Boolean!
"""Tags blacklist during scraping"""
excludeTagPatterns: [String!]
} }
type ConfigScrapingResult { type ConfigScrapingResult {
@@ -264,6 +266,8 @@ type ConfigScrapingResult {
scraperCDPPath: String scraperCDPPath: String
"""Whether the scraper should check for invalid certificates""" """Whether the scraper should check for invalid certificates"""
scraperCertCheck: Boolean! scraperCertCheck: Boolean!
"""Tags blacklist during scraping"""
excludeTagPatterns: [String!]!
} }
"""All configuration settings""" """All configuration settings"""

View File

@@ -312,6 +312,10 @@ func (r *mutationResolver) ConfigureScraping(ctx context.Context, input models.C
refreshScraperCache = true refreshScraperCache = true
} }
if input.ExcludeTagPatterns != nil {
c.Set(config.ScraperExcludeTagPatterns, input.ExcludeTagPatterns)
}
c.Set(config.ScraperCertCheck, input.ScraperCertCheck) c.Set(config.ScraperCertCheck, input.ScraperCertCheck)
if refreshScraperCache { if refreshScraperCache {
manager.GetInstance().RefreshScraperCache() manager.GetInstance().RefreshScraperCache()

View File

@@ -147,5 +147,6 @@ func makeConfigScrapingResult() *models.ConfigScrapingResult {
ScraperUserAgent: &scraperUserAgent, ScraperUserAgent: &scraperUserAgent,
ScraperCertCheck: config.GetScraperCertCheck(), ScraperCertCheck: config.GetScraperCertCheck(),
ScraperCDPPath: &scraperCDPPath, ScraperCDPPath: &scraperCDPPath,
ExcludeTagPatterns: config.GetScraperExcludeTagPatterns(),
} }
} }

View File

@@ -95,6 +95,7 @@ const ScrapersPath = "scrapers_path"
const ScraperUserAgent = "scraper_user_agent" const ScraperUserAgent = "scraper_user_agent"
const ScraperCertCheck = "scraper_cert_check" const ScraperCertCheck = "scraper_cert_check"
const ScraperCDPPath = "scraper_cdp_path" const ScraperCDPPath = "scraper_cdp_path"
const ScraperExcludeTagPatterns = "scraper_exclude_tag_patterns"
// stash-box options // stash-box options
const StashBoxes = "stash_boxes" const StashBoxes = "stash_boxes"
@@ -368,6 +369,15 @@ func (i *Instance) GetScraperCertCheck() bool {
return ret return ret
} }
// GetScraperExcludeTagPatterns returns the configured list of tag name
// patterns to exclude from scraper results. It returns nil when the
// option has not been set in the configuration.
func (i *Instance) GetScraperExcludeTagPatterns() []string {
	if !viper.IsSet(ScraperExcludeTagPatterns) {
		return nil
	}
	return viper.GetStringSlice(ScraperExcludeTagPatterns)
}
func (i *Instance) GetStashBoxes() []*models.StashBox { func (i *Instance) GetStashBoxes() []*models.StashBox {
var boxes []*models.StashBox var boxes []*models.StashBox
viper.UnmarshalKey(StashBoxes, &boxes) viper.UnmarshalKey(StashBoxes, &boxes)

View File

@@ -5,10 +5,12 @@ import (
"errors" "errors"
"os" "os"
"path/filepath" "path/filepath"
"regexp"
"strconv" "strconv"
"strings" "strings"
"github.com/stashapp/stash/pkg/logger" "github.com/stashapp/stash/pkg/logger"
stash_config "github.com/stashapp/stash/pkg/manager/config"
"github.com/stashapp/stash/pkg/models" "github.com/stashapp/stash/pkg/models"
"github.com/stashapp/stash/pkg/utils" "github.com/stashapp/stash/pkg/utils"
) )
@@ -239,12 +241,11 @@ func (c Cache) postScrapePerformer(ret *models.ScrapedPerformer) error {
if err := c.txnManager.WithReadTxn(context.TODO(), func(r models.ReaderRepository) error { if err := c.txnManager.WithReadTxn(context.TODO(), func(r models.ReaderRepository) error {
tqb := r.Tag() tqb := r.Tag()
for _, t := range ret.Tags { tags, err := postProcessTags(tqb, ret.Tags)
err := MatchScrapedSceneTag(tqb, t)
if err != nil { if err != nil {
return err return err
} }
} ret.Tags = tags
return nil return nil
}); err != nil { }); err != nil {
@@ -263,12 +264,11 @@ func (c Cache) postScrapeScenePerformer(ret *models.ScrapedScenePerformer) error
if err := c.txnManager.WithReadTxn(context.TODO(), func(r models.ReaderRepository) error { if err := c.txnManager.WithReadTxn(context.TODO(), func(r models.ReaderRepository) error {
tqb := r.Tag() tqb := r.Tag()
for _, t := range ret.Tags { tags, err := postProcessTags(tqb, ret.Tags)
err := MatchScrapedSceneTag(tqb, t)
if err != nil { if err != nil {
return err return err
} }
} ret.Tags = tags
return nil return nil
}); err != nil { }); err != nil {
@@ -302,12 +302,11 @@ func (c Cache) postScrapeScene(ret *models.ScrapedScene) error {
} }
} }
for _, t := range ret.Tags { tags, err := postProcessTags(tqb, ret.Tags)
err := MatchScrapedSceneTag(tqb, t)
if err != nil { if err != nil {
return err return err
} }
} ret.Tags = tags
if ret.Studio != nil { if ret.Studio != nil {
err := MatchScrapedSceneStudio(sqb, ret.Studio) err := MatchScrapedSceneStudio(sqb, ret.Studio)
@@ -342,12 +341,11 @@ func (c Cache) postScrapeGallery(ret *models.ScrapedGallery) error {
} }
} }
for _, t := range ret.Tags { tags, err := postProcessTags(tqb, ret.Tags)
err := MatchScrapedSceneTag(tqb, t)
if err != nil { if err != nil {
return err return err
} }
} ret.Tags = tags
if ret.Studio != nil { if ret.Studio != nil {
err := MatchScrapedSceneStudio(sqb, ret.Studio) err := MatchScrapedSceneStudio(sqb, ret.Studio)
@@ -509,3 +507,42 @@ func (c Cache) ScrapeMovieURL(url string) (*models.ScrapedMovie, error) {
return nil, nil return nil, nil
} }
// postProcessTags matches scraped tags against existing tags via tqb and
// filters out any tag whose name matches one of the configured exclusion
// patterns (config.GetScraperExcludeTagPatterns). Matching is made
// case-insensitive by lower-casing both the pattern and the tag name before
// applying the regexp. Invalid patterns are logged and skipped rather than
// aborting the scrape; excluded tag names are logged once at the end.
// Returns the surviving tags, or an error if tag matching fails.
func postProcessTags(tqb models.TagReader, scrapedTags []*models.ScrapedSceneTag) ([]*models.ScrapedSceneTag, error) {
	var ret []*models.ScrapedSceneTag

	excludePatterns := stash_config.GetInstance().GetScraperExcludeTagPatterns()
	var excludeRegexps []*regexp.Regexp

	for _, excludePattern := range excludePatterns {
		reg, err := regexp.Compile(strings.ToLower(excludePattern))
		if err != nil {
			// Fixed message formatting (was "pattern :%v").
			logger.Errorf("Invalid tag exclusion pattern: %v", err)
		} else {
			excludeRegexps = append(excludeRegexps, reg)
		}
	}

	var ignoredTags []string

ScrapeTag:
	for _, t := range scrapedTags {
		for _, reg := range excludeRegexps {
			if reg.MatchString(strings.ToLower(t.Name)) {
				ignoredTags = append(ignoredTags, t.Name)
				continue ScrapeTag
			}
		}

		if err := MatchScrapedSceneTag(tqb, t); err != nil {
			return nil, err
		}
		ret = append(ret, t)
	}

	if len(ignoredTags) > 0 {
		logger.Infof("Scraping ignored tags: %s", strings.Join(ignoredTags, ", "))
	}

	return ret, nil
}

View File

@@ -1,4 +1,5 @@
### ✨ New Features ### ✨ New Features
* Support excluding tag patterns when scraping. ([#1617](https://github.com/stashapp/stash/pull/1617))
* Support setting a custom directory for default performer images. ([#1489](https://github.com/stashapp/stash/pull/1489)) * Support setting a custom directory for default performer images. ([#1489](https://github.com/stashapp/stash/pull/1489))
* Added filtering and sorting on scene marker count for tags. ([#1603](https://github.com/stashapp/stash/pull/1603)) * Added filtering and sorting on scene marker count for tags. ([#1603](https://github.com/stashapp/stash/pull/1603))
* Support excluding fields and editing tags when saving from scene tagger view. ([#1605](https://github.com/stashapp/stash/pull/1605)) * Support excluding fields and editing tags when saving from scene tagger view. ([#1605](https://github.com/stashapp/stash/pull/1605))

View File

@@ -17,9 +17,10 @@ import StashConfiguration from "./StashConfiguration";
interface IExclusionPatternsProps { interface IExclusionPatternsProps {
excludes: string[]; excludes: string[];
setExcludes: (value: string[]) => void; setExcludes: (value: string[]) => void;
demo: string;
} }
const ExclusionPatterns: React.FC<IExclusionPatternsProps> = (props) => { export const ExclusionPatterns: React.FC<IExclusionPatternsProps> = (props) => {
function excludeRegexChanged(idx: number, value: string) { function excludeRegexChanged(idx: number, value: string) {
const newExcludes = props.excludes.map((regex, i) => { const newExcludes = props.excludes.map((regex, i) => {
const ret = idx !== i ? regex : value; const ret = idx !== i ? regex : value;
@@ -35,8 +36,7 @@ const ExclusionPatterns: React.FC<IExclusionPatternsProps> = (props) => {
} }
function excludeAddRegex() { function excludeAddRegex() {
const demo = "sample\\.mp4$"; const newExcludes = props.excludes.concat(props.demo);
const newExcludes = props.excludes.concat(demo);
props.setExcludes(newExcludes); props.setExcludes(newExcludes);
} }
@@ -490,7 +490,11 @@ export const SettingsConfigurationPanel: React.FC = () => {
id: "config.general.excluded_video_patterns_head", id: "config.general.excluded_video_patterns_head",
})} })}
</h6> </h6>
<ExclusionPatterns excludes={excludes} setExcludes={setExcludes} /> <ExclusionPatterns
excludes={excludes}
setExcludes={setExcludes}
demo="sample\.mp4$"
/>
<Form.Text className="text-muted"> <Form.Text className="text-muted">
{intl.formatMessage({ {intl.formatMessage({
id: "config.general.excluded_video_patterns_desc", id: "config.general.excluded_video_patterns_desc",
@@ -514,6 +518,7 @@ export const SettingsConfigurationPanel: React.FC = () => {
<ExclusionPatterns <ExclusionPatterns
excludes={imageExcludes} excludes={imageExcludes}
setExcludes={setImageExcludes} setExcludes={setImageExcludes}
demo="sample\.jpg$"
/> />
<Form.Text className="text-muted"> <Form.Text className="text-muted">
{intl.formatMessage({ {intl.formatMessage({

View File

@@ -14,6 +14,7 @@ import { useToast } from "src/hooks";
import { TextUtils } from "src/utils"; import { TextUtils } from "src/utils";
import { CollapseButton, Icon, LoadingIndicator } from "src/components/Shared"; import { CollapseButton, Icon, LoadingIndicator } from "src/components/Shared";
import { ScrapeType } from "src/core/generated-graphql"; import { ScrapeType } from "src/core/generated-graphql";
import { ExclusionPatterns } from "./SettingsConfigurationPanel";
interface IURLList { interface IURLList {
urls: string[]; urls: string[];
@@ -96,6 +97,7 @@ export const SettingsScrapingPanel: React.FC = () => {
undefined undefined
); );
const [scraperCertCheck, setScraperCertCheck] = useState<boolean>(true); const [scraperCertCheck, setScraperCertCheck] = useState<boolean>(true);
const [excludeTagPatterns, setExcludeTagPatterns] = useState<string[]>([]);
const { data, error } = useConfiguration(); const { data, error } = useConfiguration();
@@ -103,6 +105,7 @@ export const SettingsScrapingPanel: React.FC = () => {
scraperUserAgent, scraperUserAgent,
scraperCDPPath, scraperCDPPath,
scraperCertCheck, scraperCertCheck,
excludeTagPatterns,
}); });
useEffect(() => { useEffect(() => {
@@ -113,6 +116,7 @@ export const SettingsScrapingPanel: React.FC = () => {
setScraperUserAgent(conf.scraping.scraperUserAgent ?? undefined); setScraperUserAgent(conf.scraping.scraperUserAgent ?? undefined);
setScraperCDPPath(conf.scraping.scraperCDPPath ?? undefined); setScraperCDPPath(conf.scraping.scraperCDPPath ?? undefined);
setScraperCertCheck(conf.scraping.scraperCertCheck); setScraperCertCheck(conf.scraping.scraperCertCheck);
setExcludeTagPatterns(conf.scraping.excludeTagPatterns);
} }
}, [data, error]); }, [data, error]);
@@ -398,6 +402,24 @@ export const SettingsScrapingPanel: React.FC = () => {
</Form.Group> </Form.Group>
</Form.Group> </Form.Group>
<Form.Group>
<h6>
{intl.formatMessage({
id: "config.scraping.excluded_tag_patterns_head",
})}
</h6>
<ExclusionPatterns
excludes={excludeTagPatterns}
setExcludes={setExcludeTagPatterns}
demo="4K"
/>
<Form.Text className="text-muted">
{intl.formatMessage({
id: "config.scraping.excluded_tag_patterns_desc",
})}
</Form.Text>
</Form.Group>
<hr /> <hr />
<h4>{intl.formatMessage({ id: "config.scraping.scrapers" })}</h4> <h4>{intl.formatMessage({ id: "config.scraping.scrapers" })}</h4>

View File

@@ -243,6 +243,8 @@
"scraping": { "scraping": {
"entity_metadata": "{entityType} Metadata", "entity_metadata": "{entityType} Metadata",
"entity_scrapers": "{entityType} scrapers", "entity_scrapers": "{entityType} scrapers",
"excluded_tag_patterns_desc": "Regexps of tag names to exclude from scraping results",
"excluded_tag_patterns_head": "Excluded Tag Patterns",
"scrapers": "Scrapers", "scrapers": "Scrapers",
"search_by_name": "Search by name", "search_by_name": "Search by name",
"supported_types": "Supported types", "supported_types": "Supported types",