Scraper refactor middle (#2043)

* Push scrapeByURL into scrapers

Replace ScrapePerfomerByURL, ScrapeMovie..., ... with ScrapeByURL in
the scraperActionImpl interface. This allows us to delete a lot of
repeated code in the scrapers and replace the central part with a
switch on the scraper type.

* Fold name scraping into one call

Follow up on scraper refactoring. Name scrapers use the same code path.
This allows us to restructure some code and kill some functions, adding
variance to the name scraping code. It allows us to remove some code
repetition as well.

* Do not export loop refs.

* Simplify fragment scraping

Generalize fragment scrapers into ScrapeByFragment. This simplifies
fragment code flows into a simpler pathing which should be easier
to handle in the future.

* Eliminate more context.TODO()

In a number of cases, we have a context now. Use the context rather than
TODO() for those cases in order to make those operations cancellable.

* Pass the context for the stashbox scraper

This removes all context.TODO() in the path of the stashbox scraper,
and replaces it with the context that's present on each of the paths.

* Pass the context into subscrapers

Mostly a mechanical update, where we pass in the context for
subscraping. This removes the final context.TODO() in the scraper
code.

* Warn on unknown fields from scripts

A common mistake for new script writers are that they return fields
not known to stash. For instance the name "description" is used rather
than "details".

Decode disallowing unknown fields. If this fails, use a tee-reader to
fall back to the old behavior, but print a warning for the user in this
case. Thus, we retain the old behavior, but print warnings for scripts
which fails the more strict unknown-fields detection.

* Nil-check before running the postprocessing chain

Fixes panics when scraping returns nil values.

* Lift nil-ness in post-postprocessing

If the struct we are trying to post-process is nil, we shouldn't
enter the postprocessing flow at all. Pass the struct as a value
rather than a pointer, eliminating nil-checks as we go. Use the
top-level postProcess call to make the nil-check and then abort there
if the object we are looking at is nil.

* Allow conversion routines to handle values

If we have a non-pointer type in the interface, we should also convert
those into ScrapedContent. Otherwise we get errors on deprecated
functions.
This commit is contained in:
SmallCoccinelle
2021-11-26 01:20:06 +01:00
committed by GitHub
parent 19e69f5310
commit 4089fcf1e2
20 changed files with 442 additions and 548 deletions

View File

@@ -1,6 +1,7 @@
package scraper
import (
"context"
"errors"
"fmt"
"math"
@@ -18,7 +19,7 @@ import (
type mappedQuery interface {
runQuery(selector string) ([]string, error)
subScrape(value string) mappedQuery
subScrape(ctx context.Context, value string) mappedQuery
}
type commonMappedConfig map[string]string
@@ -38,7 +39,7 @@ func (s mappedConfig) applyCommon(c commonMappedConfig, src string) string {
return ret
}
func (s mappedConfig) process(q mappedQuery, common commonMappedConfig) mappedResults {
func (s mappedConfig) process(ctx context.Context, q mappedQuery, common commonMappedConfig) mappedResults {
var ret mappedResults
for k, attrConfig := range s {
@@ -57,7 +58,7 @@ func (s mappedConfig) process(q mappedQuery, common commonMappedConfig) mappedRe
}
if len(found) > 0 {
result := s.postProcess(q, attrConfig, found)
result := s.postProcess(ctx, q, attrConfig, found)
for i, text := range result {
ret = ret.setKey(i, k, text)
}
@@ -68,12 +69,12 @@ func (s mappedConfig) process(q mappedQuery, common commonMappedConfig) mappedRe
return ret
}
func (s mappedConfig) postProcess(q mappedQuery, attrConfig mappedScraperAttrConfig, found []string) []string {
func (s mappedConfig) postProcess(ctx context.Context, q mappedQuery, attrConfig mappedScraperAttrConfig, found []string) []string {
// check if we're concatenating the results into a single result
var ret []string
if attrConfig.hasConcat() {
result := attrConfig.concatenateResults(found)
result = attrConfig.postProcess(result, q)
result = attrConfig.postProcess(ctx, result, q)
if attrConfig.hasSplit() {
results := attrConfig.splitString(result)
results = attrConfig.cleanResults(results)
@@ -83,7 +84,7 @@ func (s mappedConfig) postProcess(q mappedQuery, attrConfig mappedScraperAttrCon
ret = []string{result}
} else {
for _, text := range found {
text = attrConfig.postProcess(text, q)
text = attrConfig.postProcess(ctx, text, q)
if attrConfig.hasSplit() {
return attrConfig.splitString(text)
}
@@ -359,12 +360,12 @@ func (c mappedRegexConfigs) apply(value string) string {
}
type postProcessAction interface {
Apply(value string, q mappedQuery) string
Apply(ctx context.Context, value string, q mappedQuery) string
}
type postProcessParseDate string
func (p *postProcessParseDate) Apply(value string, q mappedQuery) string {
func (p *postProcessParseDate) Apply(ctx context.Context, value string, q mappedQuery) string {
parseDate := string(*p)
const internalDateFormat = "2006-01-02"
@@ -396,7 +397,7 @@ func (p *postProcessParseDate) Apply(value string, q mappedQuery) string {
type postProcessSubtractDays bool
func (p *postProcessSubtractDays) Apply(value string, q mappedQuery) string {
func (p *postProcessSubtractDays) Apply(ctx context.Context, value string, q mappedQuery) string {
const internalDateFormat = "2006-01-02"
i, err := strconv.Atoi(value)
@@ -412,18 +413,18 @@ func (p *postProcessSubtractDays) Apply(value string, q mappedQuery) string {
type postProcessReplace mappedRegexConfigs
func (c *postProcessReplace) Apply(value string, q mappedQuery) string {
func (c *postProcessReplace) Apply(ctx context.Context, value string, q mappedQuery) string {
replace := mappedRegexConfigs(*c)
return replace.apply(value)
}
type postProcessSubScraper mappedScraperAttrConfig
func (p *postProcessSubScraper) Apply(value string, q mappedQuery) string {
func (p *postProcessSubScraper) Apply(ctx context.Context, value string, q mappedQuery) string {
subScrapeConfig := mappedScraperAttrConfig(*p)
logger.Debugf("Sub-scraping for: %s", value)
ss := q.subScrape(value)
ss := q.subScrape(ctx, value)
if ss != nil {
found, err := ss.runQuery(subScrapeConfig.Selector)
@@ -440,7 +441,7 @@ func (p *postProcessSubScraper) Apply(value string, q mappedQuery) string {
result = found[0]
}
result = subScrapeConfig.postProcess(result, ss)
result = subScrapeConfig.postProcess(ctx, result, ss)
return result
}
}
@@ -450,7 +451,7 @@ func (p *postProcessSubScraper) Apply(value string, q mappedQuery) string {
type postProcessMap map[string]string
func (p *postProcessMap) Apply(value string, q mappedQuery) string {
func (p *postProcessMap) Apply(ctx context.Context, value string, q mappedQuery) string {
// return the mapped value if present
m := *p
mapped, ok := m[value]
@@ -464,7 +465,7 @@ func (p *postProcessMap) Apply(value string, q mappedQuery) string {
type postProcessFeetToCm bool
func (p *postProcessFeetToCm) Apply(value string, q mappedQuery) string {
func (p *postProcessFeetToCm) Apply(ctx context.Context, value string, q mappedQuery) string {
const foot_in_cm = 30.48
const inch_in_cm = 2.54
@@ -488,7 +489,7 @@ func (p *postProcessFeetToCm) Apply(value string, q mappedQuery) string {
type postProcessLbToKg bool
func (p *postProcessLbToKg) Apply(value string, q mappedQuery) string {
func (p *postProcessLbToKg) Apply(ctx context.Context, value string, q mappedQuery) string {
const lb_in_kg = 0.45359237
w, err := strconv.ParseFloat(value, 64)
if err == nil {
@@ -690,9 +691,9 @@ func (c mappedScraperAttrConfig) splitString(value string) []string {
return res
}
func (c mappedScraperAttrConfig) postProcess(value string, q mappedQuery) string {
func (c mappedScraperAttrConfig) postProcess(ctx context.Context, value string, q mappedQuery) string {
for _, action := range c.postProcessActions {
value = action.Apply(value, q)
value = action.Apply(ctx, value, q)
}
return value
@@ -748,7 +749,7 @@ func (r mappedResults) setKey(index int, key string, value string) mappedResults
return r
}
func (s mappedScraper) scrapePerformer(q mappedQuery) (*models.ScrapedPerformer, error) {
func (s mappedScraper) scrapePerformer(ctx context.Context, q mappedQuery) (*models.ScrapedPerformer, error) {
var ret models.ScrapedPerformer
performerMap := s.Performer
@@ -758,14 +759,14 @@ func (s mappedScraper) scrapePerformer(q mappedQuery) (*models.ScrapedPerformer,
performerTagsMap := performerMap.Tags
results := performerMap.process(q, s.Common)
results := performerMap.process(ctx, q, s.Common)
if len(results) > 0 {
results[0].apply(&ret)
// now apply the tags
if performerTagsMap != nil {
logger.Debug(`Processing performer tags:`)
tagResults := performerTagsMap.process(q, s.Common)
tagResults := performerTagsMap.process(ctx, q, s.Common)
for _, p := range tagResults {
tag := &models.ScrapedTag{}
@@ -778,7 +779,7 @@ func (s mappedScraper) scrapePerformer(q mappedQuery) (*models.ScrapedPerformer,
return &ret, nil
}
func (s mappedScraper) scrapePerformers(q mappedQuery) ([]*models.ScrapedPerformer, error) {
func (s mappedScraper) scrapePerformers(ctx context.Context, q mappedQuery) ([]*models.ScrapedPerformer, error) {
var ret []*models.ScrapedPerformer
performerMap := s.Performer
@@ -786,7 +787,7 @@ func (s mappedScraper) scrapePerformers(q mappedQuery) ([]*models.ScrapedPerform
return nil, nil
}
results := performerMap.process(q, s.Common)
results := performerMap.process(ctx, q, s.Common)
for _, r := range results {
var p models.ScrapedPerformer
r.apply(&p)
@@ -796,7 +797,7 @@ func (s mappedScraper) scrapePerformers(q mappedQuery) ([]*models.ScrapedPerform
return ret, nil
}
func (s mappedScraper) processScene(q mappedQuery, r mappedResult) *models.ScrapedScene {
func (s mappedScraper) processScene(ctx context.Context, q mappedQuery, r mappedResult) *models.ScrapedScene {
var ret models.ScrapedScene
sceneScraperConfig := s.Scene
@@ -813,13 +814,13 @@ func (s mappedScraper) processScene(q mappedQuery, r mappedResult) *models.Scrap
// process performer tags once
var performerTagResults mappedResults
if scenePerformerTagsMap != nil {
performerTagResults = scenePerformerTagsMap.process(q, s.Common)
performerTagResults = scenePerformerTagsMap.process(ctx, q, s.Common)
}
// now apply the performers and tags
if scenePerformersMap.mappedConfig != nil {
logger.Debug(`Processing scene performers:`)
performerResults := scenePerformersMap.process(q, s.Common)
performerResults := scenePerformersMap.process(ctx, q, s.Common)
for _, p := range performerResults {
performer := &models.ScrapedPerformer{}
@@ -837,7 +838,7 @@ func (s mappedScraper) processScene(q mappedQuery, r mappedResult) *models.Scrap
if sceneTagsMap != nil {
logger.Debug(`Processing scene tags:`)
tagResults := sceneTagsMap.process(q, s.Common)
tagResults := sceneTagsMap.process(ctx, q, s.Common)
for _, p := range tagResults {
tag := &models.ScrapedTag{}
@@ -848,7 +849,7 @@ func (s mappedScraper) processScene(q mappedQuery, r mappedResult) *models.Scrap
if sceneStudioMap != nil {
logger.Debug(`Processing scene studio:`)
studioResults := sceneStudioMap.process(q, s.Common)
studioResults := sceneStudioMap.process(ctx, q, s.Common)
if len(studioResults) > 0 {
studio := &models.ScrapedStudio{}
@@ -859,7 +860,7 @@ func (s mappedScraper) processScene(q mappedQuery, r mappedResult) *models.Scrap
if sceneMoviesMap != nil {
logger.Debug(`Processing scene movies:`)
movieResults := sceneMoviesMap.process(q, s.Common)
movieResults := sceneMoviesMap.process(ctx, q, s.Common)
for _, p := range movieResults {
movie := &models.ScrapedMovie{}
@@ -871,7 +872,7 @@ func (s mappedScraper) processScene(q mappedQuery, r mappedResult) *models.Scrap
return &ret
}
func (s mappedScraper) scrapeScenes(q mappedQuery) ([]*models.ScrapedScene, error) {
func (s mappedScraper) scrapeScenes(ctx context.Context, q mappedQuery) ([]*models.ScrapedScene, error) {
var ret []*models.ScrapedScene
sceneScraperConfig := s.Scene
@@ -881,16 +882,16 @@ func (s mappedScraper) scrapeScenes(q mappedQuery) ([]*models.ScrapedScene, erro
}
logger.Debug(`Processing scenes:`)
results := sceneMap.process(q, s.Common)
results := sceneMap.process(ctx, q, s.Common)
for _, r := range results {
logger.Debug(`Processing scene:`)
ret = append(ret, s.processScene(q, r))
ret = append(ret, s.processScene(ctx, q, r))
}
return ret, nil
}
func (s mappedScraper) scrapeScene(q mappedQuery) (*models.ScrapedScene, error) {
func (s mappedScraper) scrapeScene(ctx context.Context, q mappedQuery) (*models.ScrapedScene, error) {
var ret models.ScrapedScene
sceneScraperConfig := s.Scene
@@ -900,16 +901,16 @@ func (s mappedScraper) scrapeScene(q mappedQuery) (*models.ScrapedScene, error)
}
logger.Debug(`Processing scene:`)
results := sceneMap.process(q, s.Common)
results := sceneMap.process(ctx, q, s.Common)
if len(results) > 0 {
ss := s.processScene(q, results[0])
ss := s.processScene(ctx, q, results[0])
ret = *ss
}
return &ret, nil
}
func (s mappedScraper) scrapeGallery(q mappedQuery) (*models.ScrapedGallery, error) {
func (s mappedScraper) scrapeGallery(ctx context.Context, q mappedQuery) (*models.ScrapedGallery, error) {
var ret models.ScrapedGallery
galleryScraperConfig := s.Gallery
@@ -923,14 +924,14 @@ func (s mappedScraper) scrapeGallery(q mappedQuery) (*models.ScrapedGallery, err
galleryStudioMap := galleryScraperConfig.Studio
logger.Debug(`Processing gallery:`)
results := galleryMap.process(q, s.Common)
results := galleryMap.process(ctx, q, s.Common)
if len(results) > 0 {
results[0].apply(&ret)
// now apply the performers and tags
if galleryPerformersMap != nil {
logger.Debug(`Processing gallery performers:`)
performerResults := galleryPerformersMap.process(q, s.Common)
performerResults := galleryPerformersMap.process(ctx, q, s.Common)
for _, p := range performerResults {
performer := &models.ScrapedPerformer{}
@@ -941,7 +942,7 @@ func (s mappedScraper) scrapeGallery(q mappedQuery) (*models.ScrapedGallery, err
if galleryTagsMap != nil {
logger.Debug(`Processing gallery tags:`)
tagResults := galleryTagsMap.process(q, s.Common)
tagResults := galleryTagsMap.process(ctx, q, s.Common)
for _, p := range tagResults {
tag := &models.ScrapedTag{}
@@ -952,7 +953,7 @@ func (s mappedScraper) scrapeGallery(q mappedQuery) (*models.ScrapedGallery, err
if galleryStudioMap != nil {
logger.Debug(`Processing gallery studio:`)
studioResults := galleryStudioMap.process(q, s.Common)
studioResults := galleryStudioMap.process(ctx, q, s.Common)
if len(studioResults) > 0 {
studio := &models.ScrapedStudio{}
@@ -965,7 +966,7 @@ func (s mappedScraper) scrapeGallery(q mappedQuery) (*models.ScrapedGallery, err
return &ret, nil
}
func (s mappedScraper) scrapeMovie(q mappedQuery) (*models.ScrapedMovie, error) {
func (s mappedScraper) scrapeMovie(ctx context.Context, q mappedQuery) (*models.ScrapedMovie, error) {
var ret models.ScrapedMovie
movieScraperConfig := s.Movie
@@ -976,13 +977,13 @@ func (s mappedScraper) scrapeMovie(q mappedQuery) (*models.ScrapedMovie, error)
movieStudioMap := movieScraperConfig.Studio
results := movieMap.process(q, s.Common)
results := movieMap.process(ctx, q, s.Common)
if len(results) > 0 {
results[0].apply(&ret)
if movieStudioMap != nil {
logger.Debug(`Processing movie studio:`)
studioResults := movieStudioMap.process(q, s.Common)
studioResults := movieStudioMap.process(ctx, q, s.Common)
if len(studioResults) > 0 {
studio := &models.ScrapedStudio{}