Expose url for URLReplace in JSON scrapeByURL and scrapeByFragment (#1150)

* Expose url for URLReplace in JSON scrapeByURL and scrapeByFragment
* Apply queryURLReplace to xpath scrapers

Co-authored-by: WithoutPants <53250216+WithoutPants@users.noreply.github.com>
This commit is contained in:
bnkai
2021-03-02 00:19:56 +02:00
committed by GitHub
parent fe990e00c1
commit 117e6326db
5 changed files with 56 additions and 9 deletions

View File

@@ -52,7 +52,7 @@ func (s *jsonScraper) loadURL(url string) (string, error) {
if err != nil { if err != nil {
return "", err return "", err
} }
logger.Infof("loadURL (%s)\n", url)
doc, err := ioutil.ReadAll(r) doc, err := ioutil.ReadAll(r)
if err != nil { if err != nil {
return "", err return "", err
@@ -71,7 +71,8 @@ func (s *jsonScraper) loadURL(url string) (string, error) {
} }
func (s *jsonScraper) scrapePerformerByURL(url string) (*models.ScrapedPerformer, error) { func (s *jsonScraper) scrapePerformerByURL(url string) (*models.ScrapedPerformer, error) {
doc, scraper, err := s.scrapeURL(url) u := replaceURL(url, s.scraper) // allow a URL Replace for performer by URL queries
doc, scraper, err := s.scrapeURL(u)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@@ -81,7 +82,8 @@ func (s *jsonScraper) scrapePerformerByURL(url string) (*models.ScrapedPerformer
} }
func (s *jsonScraper) scrapeSceneByURL(url string) (*models.ScrapedScene, error) { func (s *jsonScraper) scrapeSceneByURL(url string) (*models.ScrapedScene, error) {
doc, scraper, err := s.scrapeURL(url) u := replaceURL(url, s.scraper) // allow a URL Replace for scene by URL queries
doc, scraper, err := s.scrapeURL(u)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@@ -91,7 +93,8 @@ func (s *jsonScraper) scrapeSceneByURL(url string) (*models.ScrapedScene, error)
} }
func (s *jsonScraper) scrapeGalleryByURL(url string) (*models.ScrapedGallery, error) { func (s *jsonScraper) scrapeGalleryByURL(url string) (*models.ScrapedGallery, error) {
doc, scraper, err := s.scrapeURL(url) u := replaceURL(url, s.scraper) // allow a URL Replace for gallery by URL queries
doc, scraper, err := s.scrapeURL(u)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@@ -101,7 +104,8 @@ func (s *jsonScraper) scrapeGalleryByURL(url string) (*models.ScrapedGallery, er
} }
func (s *jsonScraper) scrapeMovieByURL(url string) (*models.ScrapedMovie, error) { func (s *jsonScraper) scrapeMovieByURL(url string) (*models.ScrapedMovie, error) {
doc, scraper, err := s.scrapeURL(url) u := replaceURL(url, s.scraper) // allow a URL Replace for movie by URL queries
doc, scraper, err := s.scrapeURL(u)
if err != nil { if err != nil {
return nil, err return nil, err
} }

View File

@@ -17,6 +17,13 @@ func queryURLParametersFromScene(scene *models.Scene) queryURLParameters {
ret["oshash"] = scene.OSHash.String ret["oshash"] = scene.OSHash.String
ret["filename"] = filepath.Base(scene.Path) ret["filename"] = filepath.Base(scene.Path)
ret["title"] = scene.Title.String ret["title"] = scene.Title.String
ret["url"] = scene.URL.String
return ret
}
// queryURLParameterFromURL returns a new queryURLParameters holding a single
// entry, "url", set to the supplied url.
func queryURLParameterFromURL(url string) queryURLParameters {
ret := make(queryURLParameters)
ret["url"] = url
return ret return ret
} }
@@ -28,6 +35,7 @@ func queryURLParametersFromGallery(gallery *models.Gallery) queryURLParameters {
ret["filename"] = filepath.Base(gallery.Path.String) ret["filename"] = filepath.Base(gallery.Path.String)
} }
ret["title"] = gallery.Title.String ret["title"] = gallery.Title.String
ret["url"] = gallery.URL.String
return ret return ret
} }
@@ -49,3 +57,14 @@ func (p queryURLParameters) constructURL(url string) string {
return ret return ret
} }
// replaceURL applies the scraper's query URL replacements to url.
//
// Only the "url" placeholder is populated. If scraperConfig has no
// QueryURLReplacements, url is returned unchanged; otherwise the replacements
// are applied to the placeholder values and the result is built from
// scraperConfig.QueryURL via constructURL.
func replaceURL(url string, scraperConfig scraperTypeConfig) string {
	if scraperConfig.QueryURLReplacements == nil {
		return url
	}

	params := queryURLParameterFromURL(url)
	params.applyReplacements(scraperConfig.QueryURLReplacements)
	return params.constructURL(scraperConfig.QueryURL)
}

View File

@@ -52,7 +52,8 @@ func (s *xpathScraper) scrapeURL(url string) (*html.Node, *mappedScraper, error)
} }
func (s *xpathScraper) scrapePerformerByURL(url string) (*models.ScrapedPerformer, error) { func (s *xpathScraper) scrapePerformerByURL(url string) (*models.ScrapedPerformer, error) {
doc, scraper, err := s.scrapeURL(url) u := replaceURL(url, s.scraper) // allow a URL Replace for performer by URL queries
doc, scraper, err := s.scrapeURL(u)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@@ -62,7 +63,8 @@ func (s *xpathScraper) scrapePerformerByURL(url string) (*models.ScrapedPerforme
} }
func (s *xpathScraper) scrapeSceneByURL(url string) (*models.ScrapedScene, error) { func (s *xpathScraper) scrapeSceneByURL(url string) (*models.ScrapedScene, error) {
doc, scraper, err := s.scrapeURL(url) u := replaceURL(url, s.scraper) // allow a URL Replace for scene by URL queries
doc, scraper, err := s.scrapeURL(u)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@@ -72,7 +74,8 @@ func (s *xpathScraper) scrapeSceneByURL(url string) (*models.ScrapedScene, error
} }
func (s *xpathScraper) scrapeGalleryByURL(url string) (*models.ScrapedGallery, error) { func (s *xpathScraper) scrapeGalleryByURL(url string) (*models.ScrapedGallery, error) {
doc, scraper, err := s.scrapeURL(url) u := replaceURL(url, s.scraper) // allow a URL Replace for gallery by URL queries
doc, scraper, err := s.scrapeURL(u)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@@ -82,7 +85,8 @@ func (s *xpathScraper) scrapeGalleryByURL(url string) (*models.ScrapedGallery, e
} }
func (s *xpathScraper) scrapeMovieByURL(url string) (*models.ScrapedMovie, error) { func (s *xpathScraper) scrapeMovieByURL(url string) (*models.ScrapedMovie, error) {
doc, scraper, err := s.scrapeURL(url) u := replaceURL(url, s.scraper) // allow a URL Replace for movie by URL queries
doc, scraper, err := s.scrapeURL(u)
if err != nil { if err != nil {
return nil, err return nil, err
} }

View File

@@ -1,4 +1,5 @@
### 🎨 Improvements ### 🎨 Improvements
* Add `{url}` placeholder field for use with `queryURL` and `queryURLReplace`, and make `queryURLReplace` available when scraping by URL.
* Make logging format consistent across platforms and include full timestamp. * Make logging format consistent across platforms and include full timestamp.
* Remember gallery images view mode. * Remember gallery images view mode.
* Add option to skip checking of insecure SSL certificates when scraping. * Add option to skip checking of insecure SSL certificates when scraping.

View File

@@ -223,6 +223,7 @@ For `sceneByFragment`, the `queryURL` field must also be present. This field is
* `{oshash}` - the oshash of the scene * `{oshash}` - the oshash of the scene
* `{filename}` - the base filename of the scene * `{filename}` - the base filename of the scene
* `{title}` - the title of the scene * `{title}` - the title of the scene
* `{url}` - the url of the scene
These placeholder field values may be manipulated with regex replacements by adding a `queryURLReplace` section, containing a map of placeholder field to regex configuration which uses the same format as the `replace` post-process action covered below. These placeholder field values may be manipulated with regex replacements by adding a `queryURLReplace` section, containing a map of placeholder field to regex configuration which uses the same format as the `replace` post-process action covered below.
@@ -241,6 +242,24 @@ sceneByFragment:
The above configuration would scrape from the value of `queryURL`, replacing `{filename}` with the base filename of the scene, after it has been manipulated by the regex replacements. The above configuration would scrape from the value of `queryURL`, replacing `{filename}` with the base filename of the scene, after it has been manipulated by the regex replacements.
### scrapeXPath and scrapeJson use with `<scene|performer|gallery|movie>ByURL`
For `sceneByURL`, `performerByURL`, `galleryByURL` and `movieByURL`, the `queryURL` field may also be present in order to use `queryURLReplace`. The functionality is the same as for `sceneByFragment`, except that the only placeholder field available is `url`:
* `{url}` - the url of the scene/performer/gallery/movie
```yaml
sceneByURL:
- action: scrapeJson
url:
- metartnetwork.com
scraper: sceneScraper
queryURL: "{url}"
queryURLReplace:
url:
- regex: '^(?:.+\.)?([^.]+)\.com/.+movie/(\d+)/(\w+)/?$'
with: https://www.$1.com/api/movie?name=$3&date=$2
```
### Stash ### Stash
A different stash server can be configured as a scraping source. This action applies only to `performerByName`, `performerByFragment`, and `sceneByFragment` types. This action requires that the top-level `stashServer` field is configured. A different stash server can be configured as a scraping source. This action applies only to `performerByName`, `performerByFragment`, and `sceneByFragment` types. This action requires that the top-level `stashServer` field is configured.