diff --git a/pkg/scraper/json.go b/pkg/scraper/json.go index b40b04e77..b7c68e86e 100644 --- a/pkg/scraper/json.go +++ b/pkg/scraper/json.go @@ -52,7 +52,7 @@ func (s *jsonScraper) loadURL(url string) (string, error) { if err != nil { return "", err } - + logger.Infof("loadURL (%s)\n", url) doc, err := ioutil.ReadAll(r) if err != nil { return "", err @@ -71,7 +71,8 @@ func (s *jsonScraper) loadURL(url string) (string, error) { } func (s *jsonScraper) scrapePerformerByURL(url string) (*models.ScrapedPerformer, error) { - doc, scraper, err := s.scrapeURL(url) + u := replaceURL(url, s.scraper) // allow a URL Replace for performer by URL queries + doc, scraper, err := s.scrapeURL(u) if err != nil { return nil, err } @@ -81,7 +82,8 @@ func (s *jsonScraper) scrapePerformerByURL(url string) (*models.ScrapedPerformer } func (s *jsonScraper) scrapeSceneByURL(url string) (*models.ScrapedScene, error) { - doc, scraper, err := s.scrapeURL(url) + u := replaceURL(url, s.scraper) // allow a URL Replace for scene by URL queries + doc, scraper, err := s.scrapeURL(u) if err != nil { return nil, err } @@ -91,7 +93,8 @@ func (s *jsonScraper) scrapeSceneByURL(url string) (*models.ScrapedScene, error) } func (s *jsonScraper) scrapeGalleryByURL(url string) (*models.ScrapedGallery, error) { - doc, scraper, err := s.scrapeURL(url) + u := replaceURL(url, s.scraper) // allow a URL Replace for gallery by URL queries + doc, scraper, err := s.scrapeURL(u) if err != nil { return nil, err } @@ -101,7 +104,8 @@ func (s *jsonScraper) scrapeGalleryByURL(url string) (*models.ScrapedGallery, er } func (s *jsonScraper) scrapeMovieByURL(url string) (*models.ScrapedMovie, error) { - doc, scraper, err := s.scrapeURL(url) + u := replaceURL(url, s.scraper) // allow a URL Replace for movie by URL queries + doc, scraper, err := s.scrapeURL(u) if err != nil { return nil, err } diff --git a/pkg/scraper/query_url.go b/pkg/scraper/query_url.go index 517df5ac2..462069d2f 100644 --- a/pkg/scraper/query_url.go +++ 
b/pkg/scraper/query_url.go @@ -17,6 +17,13 @@ func queryURLParametersFromScene(scene *models.Scene) queryURLParameters { ret["oshash"] = scene.OSHash.String ret["filename"] = filepath.Base(scene.Path) ret["title"] = scene.Title.String + ret["url"] = scene.URL.String + return ret +} + +func queryURLParameterFromURL(url string) queryURLParameters { + ret := make(queryURLParameters) + ret["url"] = url return ret } @@ -28,6 +35,7 @@ func queryURLParametersFromGallery(gallery *models.Gallery) queryURLParameters { ret["filename"] = filepath.Base(gallery.Path.String) } ret["title"] = gallery.Title.String + ret["url"] = gallery.URL.String return ret } @@ -49,3 +57,14 @@ func (p queryURLParameters) constructURL(url string) string { return ret } + +// replaceURL does a partial URL Replace ( only url parameter is used) +func replaceURL(url string, scraperConfig scraperTypeConfig) string { + u := url + queryURL := queryURLParameterFromURL(u) + if scraperConfig.QueryURLReplacements != nil { + queryURL.applyReplacements(scraperConfig.QueryURLReplacements) + u = queryURL.constructURL(scraperConfig.QueryURL) + } + return u +} diff --git a/pkg/scraper/xpath.go b/pkg/scraper/xpath.go index f4f55bcdc..e612b5f4d 100644 --- a/pkg/scraper/xpath.go +++ b/pkg/scraper/xpath.go @@ -52,7 +52,8 @@ func (s *xpathScraper) scrapeURL(url string) (*html.Node, *mappedScraper, error) } func (s *xpathScraper) scrapePerformerByURL(url string) (*models.ScrapedPerformer, error) { - doc, scraper, err := s.scrapeURL(url) + u := replaceURL(url, s.scraper) // allow a URL Replace for performer by URL queries + doc, scraper, err := s.scrapeURL(u) if err != nil { return nil, err } @@ -62,7 +63,8 @@ func (s *xpathScraper) scrapePerformerByURL(url string) (*models.ScrapedPerforme } func (s *xpathScraper) scrapeSceneByURL(url string) (*models.ScrapedScene, error) { - doc, scraper, err := s.scrapeURL(url) + u := replaceURL(url, s.scraper) // allow a URL Replace for scene by URL queries + doc, scraper, err := 
s.scrapeURL(u) if err != nil { return nil, err } @@ -72,7 +74,8 @@ func (s *xpathScraper) scrapeSceneByURL(url string) (*models.ScrapedScene, error } func (s *xpathScraper) scrapeGalleryByURL(url string) (*models.ScrapedGallery, error) { - doc, scraper, err := s.scrapeURL(url) + u := replaceURL(url, s.scraper) // allow a URL Replace for gallery by URL queries + doc, scraper, err := s.scrapeURL(u) if err != nil { return nil, err } @@ -82,7 +85,8 @@ func (s *xpathScraper) scrapeGalleryByURL(url string) (*models.ScrapedGallery, e } func (s *xpathScraper) scrapeMovieByURL(url string) (*models.ScrapedMovie, error) { - doc, scraper, err := s.scrapeURL(url) + u := replaceURL(url, s.scraper) // allow a URL Replace for movie by URL queries + doc, scraper, err := s.scrapeURL(u) if err != nil { return nil, err } diff --git a/ui/v2.5/src/components/Changelog/versions/v060.md b/ui/v2.5/src/components/Changelog/versions/v060.md index d2b6099f3..b7234ecf7 100644 --- a/ui/v2.5/src/components/Changelog/versions/v060.md +++ b/ui/v2.5/src/components/Changelog/versions/v060.md @@ -1,4 +1,5 @@ ### 🎨 Improvements +* Add `url` placeholder field to `queryURL`/`queryURLReplace`, and make `queryURLReplace` available when scraping by URL. * Make logging format consistent across platforms and include full timestamp. * Remember gallery images view mode. * Add option to skip checking of insecure SSL certificates when scraping. diff --git a/ui/v2.5/src/docs/en/Scraping.md b/ui/v2.5/src/docs/en/Scraping.md index 3c9cd2692..59065f84a 100644 --- a/ui/v2.5/src/docs/en/Scraping.md +++ b/ui/v2.5/src/docs/en/Scraping.md @@ -223,6 +223,7 @@ For `sceneByFragment`, the `queryURL` field must also be present. 
This field is * `{oshash}` - the oshash of the scene * `{filename}` - the base filename of the scene * `{title}` - the title of the scene +* `{url}` - the URL of the scene These placeholder field values may be manipulated with regex replacements by adding a `queryURLReplace` section, containing a map of placeholder field to regex configuration which uses the same format as the `replace` post-process action covered below. @@ -241,6 +242,24 @@ sceneByFragment: The above configuration would scrape from the value of `queryURL`, replacing `{filename}` with the base filename of the scene, after it has been manipulated by the regex replacements. +### scrapeXPath and scrapeJson use with `ByURL` + +For `sceneByURL`, `performerByURL`, `galleryByURL` and `movieByURL`, a `queryURL` field may also be specified so that `queryURLReplace` can be applied to the URL being scraped. This works the same way as for `sceneByFragment`, except that the only placeholder field available is `url`: +* `{url}` - the URL of the scene/performer/gallery/movie + +```yaml +sceneByURL: + - action: scrapeJson + url: + - metartnetwork.com + scraper: sceneScraper + queryURL: "{url}" + queryURLReplace: + url: + - regex: '^(?:.+\.)?([^.]+)\.com/.+movie/(\d+)/(\w+)/?$' + with: https://www.$1.com/api/movie?name=$3&date=$2 +``` + ### Stash A different stash server can be configured as a scraping source. This action applies only to `performerByName`, `performerByFragment`, and `sceneByFragment` types. This action requires that the top-level `stashServer` field is configured.