Add Image Scraping (#5562)

Co-authored-by: keenbed <155155956+keenbed@users.noreply.github.com>
Co-authored-by: WithoutPants <53250216+WithoutPants@users.noreply.github.com>
Committed by WeedLordVegeta420 on 2025-02-24 00:38:14 -05:00 (via GitHub)
parent b6ace42973
commit e97f647a43
27 changed files with 1063 additions and 11 deletions

View File

@@ -63,6 +63,28 @@ type ImageFilterType struct {
UpdatedAt *TimestampCriterionInput `json:"updated_at"`
}
type ImageUpdateInput struct {
ClientMutationID *string `json:"clientMutationId"`
ID string `json:"id"`
Title *string `json:"title"`
Code *string `json:"code"`
Urls []string `json:"urls"`
Date *string `json:"date"`
Details *string `json:"details"`
Photographer *string `json:"photographer"`
Rating100 *int `json:"rating100"`
Organized *bool `json:"organized"`
SceneIds []string `json:"scene_ids"`
StudioID *string `json:"studio_id"`
TagIds []string `json:"tag_ids"`
PerformerIds []string `json:"performer_ids"`
GalleryIds []string `json:"gallery_ids"`
PrimaryFileID *string `json:"primary_file_id"`
// deprecated
URL *string `json:"url"`
}
type ImageDestroyInput struct {
ID string `json:"id"`
DeleteFile *bool `json:"delete_file"`

View File

@@ -31,6 +31,7 @@ type scraperActionImpl interface {
scrapeSceneByScene(ctx context.Context, scene *models.Scene) (*ScrapedScene, error)
scrapeGalleryByGallery(ctx context.Context, gallery *models.Gallery) (*ScrapedGallery, error)
scrapeImageByImage(ctx context.Context, image *models.Image) (*ScrapedImage, error)
}
func (c config) getScraper(scraper scraperTypeConfig, client *http.Client, globalConfig GlobalConfig) scraperActionImpl {

View File

@@ -77,11 +77,18 @@ type GalleryFinder interface {
models.URLLoader
}
type ImageFinder interface {
models.ImageGetter
models.FileLoader
models.URLLoader
}
type Repository struct {
TxnManager models.TxnManager
SceneFinder SceneFinder
GalleryFinder GalleryFinder
ImageFinder ImageFinder
TagFinder TagFinder
PerformerFinder PerformerFinder
GroupFinder match.GroupNamesFinder
@@ -93,6 +100,7 @@ func NewRepository(repo models.Repository) Repository {
TxnManager: repo.TxnManager,
SceneFinder: repo.Scene,
GalleryFinder: repo.Gallery,
ImageFinder: repo.Image,
TagFinder: repo.Tag,
PerformerFinder: repo.Performer,
GroupFinder: repo.Group,
@@ -357,6 +365,28 @@ func (c Cache) ScrapeID(ctx context.Context, scraperID string, id int, ty Scrape
return nil, fmt.Errorf("scraper %s: %w", scraperID, err)
}
if scraped != nil {
ret = scraped
}
case ScrapeContentTypeImage:
is, ok := s.(imageScraper)
if !ok {
return nil, fmt.Errorf("%w: cannot use scraper %s as a image scraper", ErrNotSupported, scraperID)
}
image, err := c.getImage(ctx, id)
if err != nil {
return nil, fmt.Errorf("scraper %s: unable to load image id %v: %w", scraperID, id, err)
}
// don't assign nil concrete pointer to ret interface, otherwise nil
// detection is harder
scraped, err := is.viaImage(ctx, c.client, image)
if err != nil {
return nil, fmt.Errorf("scraper %s: %w", scraperID, err)
}
if scraped != nil {
ret = scraped
}
@@ -426,3 +456,31 @@ func (c Cache) getGallery(ctx context.Context, galleryID int) (*models.Gallery,
}
return ret, nil
}
func (c Cache) getImage(ctx context.Context, imageID int) (*models.Image, error) {
var ret *models.Image
r := c.repository
if err := r.WithReadTxn(ctx, func(ctx context.Context) error {
qb := r.ImageFinder
var err error
ret, err = qb.Find(ctx, imageID)
if err != nil {
return err
}
if ret == nil {
return fmt.Errorf("image with id %d not found", imageID)
}
err = ret.LoadFiles(ctx, qb)
if err != nil {
return err
}
return ret.LoadURLs(ctx, qb)
}); err != nil {
return nil, err
}
return ret, nil
}
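For context, a hedged usage sketch (not part of this commit) of how a caller might drive the new image path through the cache. The ScrapeID signature and its ScrapedContent return type are inferred from the hunk above; the scraper ID and helper names here are invented.

package example

import (
    "context"
    "fmt"

    "github.com/stashapp/stash/pkg/scraper"
)

// scrapeImageByID asks the scraper cache to scrape a stored image by its ID.
// "exampleScraper" is a made-up scraper ID for illustration.
func scrapeImageByID(ctx context.Context, cache scraper.Cache, imageID int) error {
    content, err := cache.ScrapeID(ctx, "exampleScraper", imageID, scraper.ScrapeContentTypeImage)
    if err != nil {
        return err
    }

    // post-scraping may hand back either a value or a pointer, so check both.
    switch img := content.(type) {
    case scraper.ScrapedImage:
        fmt.Println("scraped image title:", deref(img.Title))
    case *scraper.ScrapedImage:
        fmt.Println("scraped image title:", deref(img.Title))
    default:
        fmt.Println("scraper returned no image content")
    }
    return nil
}

func deref(s *string) string {
    if s == nil {
        return ""
    }
    return *s
}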

View File

@@ -45,6 +45,13 @@ type config struct {
// Configuration for querying a gallery by a URL
GalleryByURL []*scrapeByURLConfig `yaml:"galleryByURL"`
// Configuration for querying an image by a URL
ImageByURL []*scrapeByURLConfig `yaml:"imageByURL"`
// Configuration for querying an image by an Image fragment
ImageByFragment *scraperTypeConfig `yaml:"imageByFragment"`
// Configuration for querying a movie by a URL - deprecated, use GroupByURL
MovieByURL []*scrapeByURLConfig `yaml:"movieByURL"`
GroupByURL []*scrapeByURLConfig `yaml:"groupByURL"`
@@ -295,6 +302,21 @@ func (c config) spec() Scraper {
ret.Gallery = &gallery
}
image := ScraperSpec{}
if c.ImageByFragment != nil {
image.SupportedScrapes = append(image.SupportedScrapes, ScrapeTypeFragment)
}
if len(c.ImageByURL) > 0 {
image.SupportedScrapes = append(image.SupportedScrapes, ScrapeTypeURL)
for _, v := range c.ImageByURL {
image.Urls = append(image.Urls, v.URL...)
}
}
if len(image.SupportedScrapes) > 0 {
ret.Image = &image
}
group := ScraperSpec{}
if len(c.MovieByURL) > 0 || len(c.GroupByURL) > 0 {
group.SupportedScrapes = append(group.SupportedScrapes, ScrapeTypeURL)
@@ -319,6 +341,8 @@ func (c config) supports(ty ScrapeContentType) bool {
return (c.SceneByName != nil && c.SceneByQueryFragment != nil) || c.SceneByFragment != nil || len(c.SceneByURL) > 0
case ScrapeContentTypeGallery:
return c.GalleryByFragment != nil || len(c.GalleryByURL) > 0
case ScrapeContentTypeImage:
return c.ImageByFragment != nil || len(c.ImageByURL) > 0
case ScrapeContentTypeMovie, ScrapeContentTypeGroup:
return len(c.MovieByURL) > 0 || len(c.GroupByURL) > 0
}
@@ -346,6 +370,12 @@ func (c config) matchesURL(url string, ty ScrapeContentType) bool {
return true
}
}
case ScrapeContentTypeImage:
for _, scraper := range c.ImageByURL {
if scraper.matchesURL(url) {
return true
}
}
case ScrapeContentTypeMovie, ScrapeContentTypeGroup:
for _, scraper := range c.MovieByURL {
if scraper.matchesURL(url) {
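To make the new YAML surface concrete, a hedged, self-contained sketch (not from this commit) of the top-level keys these fields parse. The mirror structs below only echo the yaml tags shown above (imageByURL, imageByFragment); the action/url/scraper keys follow the project's existing scraper YAML conventions, and the scraper name, action, and URL values are invented.

package main

import (
    "fmt"

    yaml "gopkg.in/yaml.v2"
)

// Local mirrors of the relevant config fields; the real types are unexported
// inside the scraper package.
type scraperType struct {
    Action  string `yaml:"action"`
    Scraper string `yaml:"scraper"`
}

type scrapeByURL struct {
    scraperType `yaml:",inline"`
    URL         []string `yaml:"url"`
}

type scraperConfig struct {
    Name            string         `yaml:"name"`
    ImageByURL      []*scrapeByURL `yaml:"imageByURL"`
    ImageByFragment *scraperType   `yaml:"imageByFragment"`
}

const exampleConfig = `
name: Example
imageByURL:
  - action: scrapeXPath
    url:
      - https://example.org/image/
    scraper: imageScraper
imageByFragment:
  action: scrapeXPath
  scraper: imageScraper
`

func main() {
    var c scraperConfig
    if err := yaml.Unmarshal([]byte(exampleConfig), &c); err != nil {
        panic(err)
    }
    fmt.Println(c.Name, c.ImageByURL[0].URL, c.ImageByFragment.Scraper)
}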

View File

@@ -33,6 +33,9 @@ func (g group) fragmentScraper(input Input) *scraperTypeConfig {
case input.Gallery != nil:
// TODO - this should be galleryByQueryFragment
return g.config.GalleryByFragment
case input.Image != nil:
// TODO - this should be imageByImageFragment
return g.config.ImageByFragment
case input.Scene != nil:
return g.config.SceneByQueryFragment
}
@@ -75,6 +78,15 @@ func (g group) viaGallery(ctx context.Context, client *http.Client, gallery *mod
return s.scrapeGalleryByGallery(ctx, gallery)
}
func (g group) viaImage(ctx context.Context, client *http.Client, image *models.Image) (*ScrapedImage, error) {
if g.config.ImageByFragment == nil {
return nil, ErrNotSupported
}
s := g.config.getScraper(*g.config.ImageByFragment, client, g.globalConf)
return s.scrapeImageByImage(ctx, image)
}
func loadUrlCandidates(c config, ty ScrapeContentType) []*scrapeByURLConfig {
switch ty {
case ScrapeContentTypePerformer:
@@ -85,6 +97,8 @@ func loadUrlCandidates(c config, ty ScrapeContentType) []*scrapeByURLConfig {
return append(c.MovieByURL, c.GroupByURL...)
case ScrapeContentTypeGallery:
return c.GalleryByURL
case ScrapeContentTypeImage:
return c.ImageByURL
}
panic("loadUrlCandidates: unreachable")

View File

@@ -11,6 +11,28 @@ import (
"github.com/stashapp/stash/pkg/utils"
)
type ScrapedImage struct {
Title *string `json:"title"`
Code *string `json:"code"`
Details *string `json:"details"`
Photographer *string `json:"photographer"`
URLs []string `json:"urls"`
Date *string `json:"date"`
Studio *models.ScrapedStudio `json:"studio"`
Tags []*models.ScrapedTag `json:"tags"`
Performers []*models.ScrapedPerformer `json:"performers"`
}
func (ScrapedImage) IsScrapedContent() {}
type ScrapedImageInput struct {
Title *string `json:"title"`
Code *string `json:"code"`
Details *string `json:"details"`
URLs []string `json:"urls"`
Date *string `json:"date"`
}
func setPerformerImage(ctx context.Context, client *http.Client, p *models.ScrapedPerformer, globalConfig GlobalConfig) error {
// backwards compatibility: we fetch the image if it's a URL and set it to the first image
// Image is deprecated, so only do this if Images is unset

View File

@@ -102,6 +102,12 @@ func (s *jsonScraper) scrapeByURL(ctx context.Context, url string, ty ScrapeCont
return nil, err
}
return ret, nil
case ScrapeContentTypeImage:
ret, err := scraper.scrapeImage(ctx, q)
if err != nil || ret == nil {
return nil, err
}
return ret, nil
case ScrapeContentTypeMovie, ScrapeContentTypeGroup:
ret, err := scraper.scrapeGroup(ctx, q)
if err != nil || ret == nil {
@@ -225,6 +231,30 @@ func (s *jsonScraper) scrapeByFragment(ctx context.Context, input Input) (Scrape
return scraper.scrapeScene(ctx, q)
}
func (s *jsonScraper) scrapeImageByImage(ctx context.Context, image *models.Image) (*ScrapedImage, error) {
// construct the URL
queryURL := queryURLParametersFromImage(image)
if s.scraper.QueryURLReplacements != nil {
queryURL.applyReplacements(s.scraper.QueryURLReplacements)
}
url := queryURL.constructURL(s.scraper.QueryURL)
scraper := s.getJsonScraper()
if scraper == nil {
return nil, errors.New("json scraper with name " + s.scraper.Scraper + " not found in config")
}
doc, err := s.loadURL(ctx, url)
if err != nil {
return nil, err
}
q := s.getJsonQuery(doc)
return scraper.scrapeImage(ctx, q)
}
func (s *jsonScraper) scrapeGalleryByGallery(ctx context.Context, gallery *models.Gallery) (*ScrapedGallery, error) {
// construct the URL
queryURL := queryURLParametersFromGallery(gallery)

View File

@@ -181,6 +181,7 @@ type mappedGalleryScraperConfig struct {
Performers mappedConfig `yaml:"Performers"`
Studio mappedConfig `yaml:"Studio"`
}
type _mappedGalleryScraperConfig mappedGalleryScraperConfig
func (s *mappedGalleryScraperConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
@@ -228,6 +229,60 @@ func (s *mappedGalleryScraperConfig) UnmarshalYAML(unmarshal func(interface{}) e
return nil
}
type mappedImageScraperConfig struct {
mappedConfig
Tags mappedConfig `yaml:"Tags"`
Performers mappedConfig `yaml:"Performers"`
Studio mappedConfig `yaml:"Studio"`
}
type _mappedImageScraperConfig mappedImageScraperConfig
func (s *mappedImageScraperConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
// HACK - unmarshal to map first, then remove known image sub-fields, then
// remarshal to yaml and pass that down to the base map
parentMap := make(map[string]interface{})
if err := unmarshal(parentMap); err != nil {
return err
}
// move the known sub-fields to a separate map
thisMap := make(map[string]interface{})
thisMap[mappedScraperConfigSceneTags] = parentMap[mappedScraperConfigSceneTags]
thisMap[mappedScraperConfigScenePerformers] = parentMap[mappedScraperConfigScenePerformers]
thisMap[mappedScraperConfigSceneStudio] = parentMap[mappedScraperConfigSceneStudio]
delete(parentMap, mappedScraperConfigSceneTags)
delete(parentMap, mappedScraperConfigScenePerformers)
delete(parentMap, mappedScraperConfigSceneStudio)
// re-unmarshal the sub-fields
yml, err := yaml.Marshal(thisMap)
if err != nil {
return err
}
// needs to be a different type to prevent infinite recursion
c := _mappedImageScraperConfig{}
if err := yaml.Unmarshal(yml, &c); err != nil {
return err
}
*s = mappedImageScraperConfig(c)
yml, err = yaml.Marshal(parentMap)
if err != nil {
return err
}
if err := yaml.Unmarshal(yml, &s.mappedConfig); err != nil {
return err
}
return nil
}
type mappedPerformerScraperConfig struct {
mappedConfig
@@ -785,6 +840,7 @@ type mappedScraper struct {
Common commonMappedConfig `yaml:"common"`
Scene *mappedSceneScraperConfig `yaml:"scene"`
Gallery *mappedGalleryScraperConfig `yaml:"gallery"`
Image *mappedImageScraperConfig `yaml:"image"`
Performer *mappedPerformerScraperConfig `yaml:"performer"`
Movie *mappedMovieScraperConfig `yaml:"movie"`
}
@@ -1016,6 +1072,57 @@ func (s mappedScraper) scrapeScene(ctx context.Context, q mappedQuery) (*Scraped
return nil, nil
}
func (s mappedScraper) scrapeImage(ctx context.Context, q mappedQuery) (*ScrapedImage, error) {
var ret ScrapedImage
imageScraperConfig := s.Image
if imageScraperConfig == nil {
return nil, nil
}
imageMap := imageScraperConfig.mappedConfig
imagePerformersMap := imageScraperConfig.Performers
imageTagsMap := imageScraperConfig.Tags
imageStudioMap := imageScraperConfig.Studio
logger.Debug(`Processing image:`)
results := imageMap.process(ctx, q, s.Common)
// now apply the performers and tags
if imagePerformersMap != nil {
logger.Debug(`Processing image performers:`)
ret.Performers = processRelationships[models.ScrapedPerformer](ctx, s, imagePerformersMap, q)
}
if imageTagsMap != nil {
logger.Debug(`Processing image tags:`)
ret.Tags = processRelationships[models.ScrapedTag](ctx, s, imageTagsMap, q)
}
if imageStudioMap != nil {
logger.Debug(`Processing image studio:`)
studioResults := imageStudioMap.process(ctx, q, s.Common)
if len(studioResults) > 0 {
studio := &models.ScrapedStudio{}
studioResults[0].apply(studio)
ret.Studio = studio
}
}
// if no basic fields are populated, and no relationships, then return nil
if len(results) == 0 && len(ret.Performers) == 0 && len(ret.Tags) == 0 && ret.Studio == nil {
return nil, nil
}
if len(results) > 0 {
results[0].apply(&ret)
}
return &ret, nil
}
func (s mappedScraper) scrapeGallery(ctx context.Context, q mappedQuery) (*ScrapedGallery, error) {
var ret ScrapedGallery
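A hedged illustration (not part of this commit) of the image: mapping shape that mappedImageScraperConfig accepts, and of the split its UnmarshalYAML performs: the Tags/Performers/Studio sub-blocks are peeled off and everything else falls through to the base field mapping. The XPath selectors below are invented.

package main

import (
    "fmt"

    yaml "gopkg.in/yaml.v2"
)

// Only the key layout mirrors mappedImageScraperConfig; values are made up.
const exampleImageMapping = `
Title: //h1/text()
Photographer: //span[@class="credit"]/text()
Tags:
  Name: //a[@rel="tag"]/text()
Performers:
  Name: //a[@class="model"]/text()
Studio:
  Name: //div[@id="studio"]/a/text()
`

func main() {
    parent := map[string]interface{}{}
    if err := yaml.Unmarshal([]byte(exampleImageMapping), &parent); err != nil {
        panic(err)
    }

    // Same idea as the UnmarshalYAML hack above: relationship blocks are
    // handled separately, the remaining keys are plain field mappings.
    for _, k := range []string{"Tags", "Performers", "Studio"} {
        fmt.Println("relationship block:", k, parent[k])
        delete(parent, k)
    }
    fmt.Println("plain image fields:", parent)
}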

View File

@@ -33,6 +33,12 @@ func (c Cache) postScrape(ctx context.Context, content ScrapedContent) (ScrapedC
}
case ScrapedGallery:
return c.postScrapeGallery(ctx, v)
case *ScrapedImage:
if v != nil {
return c.postScrapeImage(ctx, *v)
}
case ScrapedImage:
return c.postScrapeImage(ctx, v)
case *models.ScrapedMovie:
if v != nil {
return c.postScrapeMovie(ctx, *v)
@@ -315,6 +321,40 @@ func (c Cache) postScrapeGallery(ctx context.Context, g ScrapedGallery) (Scraped
return g, nil
}
func (c Cache) postScrapeImage(ctx context.Context, image ScrapedImage) (ScrapedContent, error) {
r := c.repository
if err := r.WithReadTxn(ctx, func(ctx context.Context) error {
pqb := r.PerformerFinder
tqb := r.TagFinder
sqb := r.StudioFinder
for _, p := range image.Performers {
if err := match.ScrapedPerformer(ctx, pqb, p, nil); err != nil {
return err
}
}
tags, err := postProcessTags(ctx, tqb, image.Tags)
if err != nil {
return err
}
image.Tags = tags
if image.Studio != nil {
err := match.ScrapedStudio(ctx, sqb, image.Studio, nil)
if err != nil {
return err
}
}
return nil
}); err != nil {
return nil, err
}
return image, nil
}
func postProcessTags(ctx context.Context, tqb models.TagQueryer, scrapedTags []*models.ScrapedTag) ([]*models.ScrapedTag, error) {
var ret []*models.ScrapedTag

View File

@@ -73,6 +73,24 @@ func queryURLParametersFromGallery(gallery *models.Gallery) queryURLParameters {
return ret
}
func queryURLParametersFromImage(image *models.Image) queryURLParameters {
ret := make(queryURLParameters)
ret["checksum"] = image.Checksum
if image.Path != "" {
ret["filename"] = filepath.Base(image.Path)
}
if image.Title != "" {
ret["title"] = image.Title
}
if len(image.URLs.List()) > 0 {
ret["url"] = image.URLs.List()[0]
}
return ret
}
func (p queryURLParameters) applyReplacements(r queryURLReplacements) {
for k, v := range p {
rpl, found := r[k]
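As a rough illustration of where these parameters end up, a simplified stand-in (not the project's constructURL): stash scraper queryURL templates use {placeholder} tokens such as {checksum} and {filename}, which are replaced with the values built by queryURLParametersFromImage above. The template and values here are invented.

package main

import (
    "fmt"
    "strings"
)

func main() {
    // Values of the kind queryURLParametersFromImage would produce.
    params := map[string]string{
        "checksum": "abc123",
        "filename": "example.jpg",
        "title":    "Example title",
    }

    // Hypothetical queryURL template from a scraper definition.
    queryURL := "https://example.org/search?md5={checksum}&name={filename}"

    for k, v := range params {
        queryURL = strings.ReplaceAll(queryURL, "{"+k+"}", v)
    }
    fmt.Println(queryURL) // https://example.org/search?md5=abc123&name=example.jpg
}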

View File

@@ -36,6 +36,7 @@ const (
ScrapeContentTypeGroup ScrapeContentType = "GROUP"
ScrapeContentTypePerformer ScrapeContentType = "PERFORMER"
ScrapeContentTypeScene ScrapeContentType = "SCENE"
ScrapeContentTypeImage ScrapeContentType = "IMAGE"
)
var AllScrapeContentType = []ScrapeContentType{
@@ -44,11 +45,12 @@ var AllScrapeContentType = []ScrapeContentType{
ScrapeContentTypeGroup,
ScrapeContentTypePerformer,
ScrapeContentTypeScene,
ScrapeContentTypeImage,
}
func (e ScrapeContentType) IsValid() bool {
switch e {
case ScrapeContentTypeGallery, ScrapeContentTypeMovie, ScrapeContentTypeGroup, ScrapeContentTypePerformer, ScrapeContentTypeScene, ScrapeContentTypeImage:
return true
}
return false
@@ -84,6 +86,8 @@ type Scraper struct {
Scene *ScraperSpec `json:"scene"`
// Details for gallery scraper
Gallery *ScraperSpec `json:"gallery"`
// Details for image scraper
Image *ScraperSpec `json:"image"`
// Details for group scraper
Group *ScraperSpec `json:"group"`
// Details for movie scraper
@@ -161,6 +165,7 @@ type Input struct {
Performer *ScrapedPerformerInput
Scene *ScrapedSceneInput
Gallery *ScrapedGalleryInput
Image *ScrapedImageInput
}
// populateURL populates the URL field of the input based on the
@@ -225,6 +230,14 @@ type sceneScraper interface {
viaScene(ctx context.Context, client *http.Client, scene *models.Scene) (*ScrapedScene, error)
}
// imageScraper is a scraper which supports image scrapes with
// image data as the input.
type imageScraper interface {
scraper
viaImage(ctx context.Context, client *http.Client, image *models.Image) (*ScrapedImage, error)
}
// galleryScraper is a scraper which supports gallery scrapes with
// gallery data as the input.
type galleryScraper interface {

View File

@@ -388,6 +388,10 @@ func (s *scriptScraper) scrape(ctx context.Context, input string, ty ScrapeConte
var movie *models.ScrapedMovie
err := s.runScraperScript(ctx, input, &movie)
return movie, err
case ScrapeContentTypeImage:
var image *ScrapedImage
err := s.runScraperScript(ctx, input, &image)
return image, err
}
return nil, ErrNotSupported
@@ -421,6 +425,20 @@ func (s *scriptScraper) scrapeGalleryByGallery(ctx context.Context, gallery *mod
return ret, err
}
func (s *scriptScraper) scrapeImageByImage(ctx context.Context, image *models.Image) (*ScrapedImage, error) {
inString, err := json.Marshal(imageToUpdateInput(image))
if err != nil {
return nil, err
}
var ret *ScrapedImage
err = s.runScraperScript(ctx, string(inString), &ret)
return ret, err
}
func handleScraperStderr(name string, scraperOutputReader io.ReadCloser) {
const scraperPrefix = "[Scrape / %s] "

View File

@@ -388,6 +388,33 @@ func (s *stashScraper) scrapeGalleryByGallery(ctx context.Context, gallery *mode
return &ret, nil
}
func (s *stashScraper) scrapeImageByImage(ctx context.Context, image *models.Image) (*ScrapedImage, error) {
return nil, ErrNotSupported
}
func (s *stashScraper) scrapeByURL(_ context.Context, _ string, _ ScrapeContentType) (ScrapedContent, error) {
return nil, ErrNotSupported
}
func imageToUpdateInput(image *models.Image) models.ImageUpdateInput {
dateToStringPtr := func(s *models.Date) *string {
if s != nil {
v := s.String()
return &v
}
return nil
}
// fallback to file basename if title is empty
title := image.GetTitle()
urls := image.URLs.List()
return models.ImageUpdateInput{
ID: strconv.Itoa(image.ID),
Title: &title,
Details: &image.Details,
Urls: urls,
Date: dateToStringPtr(image.Date),
}
}
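For script-based scrapers, the result of imageToUpdateInput is what scrapeImageByImage marshals onto the script's stdin. A hedged sketch (not from this commit) of that payload's shape, with invented field values:

package main

import (
    "encoding/json"
    "fmt"

    "github.com/stashapp/stash/pkg/models"
)

func main() {
    title := "Example title"
    details := ""
    in := models.ImageUpdateInput{
        ID:      "42",
        Title:   &title,
        Details: &details,
        Urls:    []string{"https://example.org/image/42"},
    }

    // This JSON is what a script image scraper reads from stdin; the script
    // is then expected to print ScrapedImage-shaped JSON (title, urls,
    // tags, ...) back on stdout.
    b, _ := json.MarshalIndent(in, "", "  ")
    fmt.Println(string(b))
}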

View File

@@ -83,6 +83,12 @@ func (s *xpathScraper) scrapeByURL(ctx context.Context, url string, ty ScrapeCon
return nil, err
}
return ret, nil
case ScrapeContentTypeImage:
ret, err := scraper.scrapeImage(ctx, q)
if err != nil || ret == nil {
return nil, err
}
return ret, nil
case ScrapeContentTypeMovie, ScrapeContentTypeGroup:
ret, err := scraper.scrapeGroup(ctx, q)
if err != nil || ret == nil {
@@ -228,6 +234,30 @@ func (s *xpathScraper) scrapeGalleryByGallery(ctx context.Context, gallery *mode
return scraper.scrapeGallery(ctx, q)
}
func (s *xpathScraper) scrapeImageByImage(ctx context.Context, image *models.Image) (*ScrapedImage, error) {
// construct the URL
queryURL := queryURLParametersFromImage(image)
if s.scraper.QueryURLReplacements != nil {
queryURL.applyReplacements(s.scraper.QueryURLReplacements)
}
url := queryURL.constructURL(s.scraper.QueryURL)
scraper := s.getXpathScraper()
if scraper == nil {
return nil, errors.New("xpath scraper with name " + s.scraper.Scraper + " not found in config")
}
doc, err := s.loadURL(ctx, url)
if err != nil {
return nil, err
}
q := s.getXPathQuery(doc)
return scraper.scrapeImage(ctx, q)
}
func (s *xpathScraper) loadURL(ctx context.Context, url string) (*html.Node, error) {
r, err := loadURL(ctx, url, s.client, s.config, s.globalConfig)
if err != nil {