scraper/mapped: Add feetToCm post process. (#711)

This patch adds a feetToCm post process that converts imperial feet and
inches to centimeters.
This commit is contained in:
woodgen
2020-08-12 03:17:43 +02:00
committed by GitHub
parent 551c13bbc8
commit e3ea3ea85e
4 changed files with 79 additions and 29 deletions

View File

@@ -3,8 +3,10 @@ package scraper
import ( import (
"errors" "errors"
"fmt" "fmt"
"math"
"reflect" "reflect"
"regexp" "regexp"
"strconv"
"strings" "strings"
"time" "time"
@@ -328,11 +330,36 @@ func (p *postProcessMap) Apply(value string, q mappedQuery) string {
return value return value
} }
// feetToCmNumberRE matches each run of ASCII digits in a scraped value.
// It is compiled once at package scope instead of on every Apply call.
var feetToCmNumberRE = regexp.MustCompile("[0-9]+")

// postProcessFeetToCm is a post-process action that converts a height
// expressed in imperial feet and inches into centimeters.
type postProcessFeetToCm bool

// Apply extracts up to two runs of digits from value, treats the first as
// feet and the second as inches, and returns the equivalent length in
// centimeters rounded to the nearest integer, formatted as a decimal string.
// A missing number counts as zero, so a value without any digits yields "0".
// Digits are split on any non-digit character, so "6.2" is read as 6 feet
// 2 inches rather than as a decimal number. q is not used.
func (p *postProcessFeetToCm) Apply(value string, q mappedQuery) string {
	const (
		footInCm = 30.48
		inchInCm = 2.54
	)

	// Only the first two numbers are ever used, so stop matching after two.
	numbers := feetToCmNumberRE.FindAllString(value, 2)

	// The ParseFloat errors are deliberately ignored: every match consists
	// solely of digits, so it is always syntactically valid.
	// NOTE(review): an absurdly long digit run can still exceed the float64
	// or int range, in which case the converted result is not meaningful.
	var feet, inches float64
	if len(numbers) > 0 {
		feet, _ = strconv.ParseFloat(numbers[0], 64)
	}
	if len(numbers) > 1 {
		inches, _ = strconv.ParseFloat(numbers[1], 64)
	}

	centimeters := feet*footInCm + inches*inchInCm

	// Return rounded integer string
	return strconv.Itoa(int(math.Round(centimeters)))
}
// mappedPostProcessAction is the YAML representation of a single
// post-process step. ToPostProcessAction turns it into a postProcessAction
// and reports an error when more than one of its fields is set.
type mappedPostProcessAction struct { type mappedPostProcessAction struct {
ParseDate string `yaml:"parseDate"` ParseDate string `yaml:"parseDate"`
Replace mappedRegexConfigs `yaml:"replace"` Replace mappedRegexConfigs `yaml:"replace"`
SubScraper *mappedScraperAttrConfig `yaml:"subScraper"` SubScraper *mappedScraperAttrConfig `yaml:"subScraper"`
Map map[string]string `yaml:"map"` Map map[string]string `yaml:"map"`
// FeetToCm, when true, selects the feet-and-inches to centimeters
// conversion (postProcessFeetToCm).
FeetToCm bool `yaml:"feetToCm"`
} }
func (a mappedPostProcessAction) ToPostProcessAction() (postProcessAction, error) { func (a mappedPostProcessAction) ToPostProcessAction() (postProcessAction, error) {
@@ -368,6 +395,14 @@ func (a mappedPostProcessAction) ToPostProcessAction() (postProcessAction, error
action := postProcessMap(a.Map) action := postProcessMap(a.Map)
ret = &action ret = &action
} }
if a.FeetToCm {
if found != "" {
return nil, fmt.Errorf("post-process actions must have a single field, found %s and %s", found, "feetToCm")
}
found = "feetToCm"
action := postProcessFeetToCm(a.FeetToCm)
ret = &action
}
if ret == nil { if ret == nil {
return nil, errors.New("invalid post-process action") return nil, errors.New("invalid post-process action")

View File

@@ -3,6 +3,7 @@ package scraper
import ( import (
"testing" "testing"
"github.com/stretchr/testify/assert"
"gopkg.in/yaml.v2" "gopkg.in/yaml.v2"
) )
@@ -29,3 +30,31 @@ xPathScrapers:
return return
} }
} }
// feetToCMTest pairs a raw height string (in) with the centimeter string
// (out) that the feetToCm post process is expected to return for it.
type feetToCMTest struct {
	in, out string
}

// feetToCMTests is the table of cases exercised by TestFeetToCM.
var feetToCMTests = []feetToCMTest{
	// No digits at all: both feet and inches default to zero.
	{in: "", out: "0"},
	{in: "a", out: "0"},

	// Feet only.
	{in: "6", out: "183"},
	{in: "6 feet", out: "183"},
	{in: "6ft0", out: "183"},

	// Feet and inches with various separators.
	{in: "6ft2", out: "188"},
	{in: "6'2\"", out: "188"},
	{in: "6.2", out: "188"},

	// Anything after the second number is ignored, as is surrounding text.
	{in: "6ft2.99", out: "188"},
	{in: "text6other2", out: "188"},
}
// TestFeetToCM runs every entry of feetToCMTests through the feetToCm post
// process and checks that the returned centimeter string matches.
func TestFeetToCM(t *testing.T) {
	pp := postProcessFeetToCm(true)
	q := &xpathQuery{}

	for _, tc := range feetToCMTests {
		// Several cases share the same expected output, so name the input in
		// the failure message to identify which table entry failed.
		assert.Equal(t, tc.out, pp.Apply(tc.in, q), "input: %q", tc.in)
	}
}

View File

@@ -97,28 +97,7 @@ const htmlDoc1 = `
<b>Height:</b> <b>Height:</b>
</td> </td>
<td class="paramvalue"> <td class="paramvalue">
<script type="text/javascript"> 5ft7
<!--
heightcm = "171";
morethenone = 'inch';
feet = heightcm / 30.48;
inches = (feet - Math.floor(feet)) * 30.48 / 2.54;
feet = Math.floor(feet);
inches = inches.toFixed(0);
if (inches > 1) {
morethenone = 'inches';
}
if (heightcm == 0) {
message = 'Unknown';
} else {
message = '171 cm - ' + feet + ' feet and ' + inches + ' ' + morethenone;
}
document.write(message);
// -->
</script>&nbsp;
</td> </td>
</tr> </tr>
<tr> <tr>
@@ -209,12 +188,10 @@ func makeXPathConfig() mappedPerformerScraperConfig {
config.mappedConfig["Name"] = makeSimpleAttrConfig(makeCommonXPath("Babe Name:") + `/a`) config.mappedConfig["Name"] = makeSimpleAttrConfig(makeCommonXPath("Babe Name:") + `/a`)
config.mappedConfig["Ethnicity"] = makeSimpleAttrConfig(makeCommonXPath("Ethnicity:")) config.mappedConfig["Ethnicity"] = makeSimpleAttrConfig(makeCommonXPath("Ethnicity:"))
config.mappedConfig["Country"] = makeSimpleAttrConfig(makeCommonXPath("Country of Origin:"))
config.mappedConfig["Aliases"] = makeSimpleAttrConfig(makeCommonXPath("Aliases:")) config.mappedConfig["Aliases"] = makeSimpleAttrConfig(makeCommonXPath("Aliases:"))
config.mappedConfig["EyeColor"] = makeSimpleAttrConfig(makeCommonXPath("Eye Color:")) config.mappedConfig["EyeColor"] = makeSimpleAttrConfig(makeCommonXPath("Eye Color:"))
config.mappedConfig["Measurements"] = makeSimpleAttrConfig(makeCommonXPath("Measurements:")) config.mappedConfig["Measurements"] = makeSimpleAttrConfig(makeCommonXPath("Measurements:"))
config.mappedConfig["FakeTits"] = makeSimpleAttrConfig(makeCommonXPath("Fake boobs:")) config.mappedConfig["FakeTits"] = makeSimpleAttrConfig(makeCommonXPath("Fake boobs:"))
config.mappedConfig["Height"] = makeSimpleAttrConfig(makeCommonXPath("Height:"))
config.mappedConfig["Tattoos"] = makeSimpleAttrConfig(makeCommonXPath("Tattoos:")) config.mappedConfig["Tattoos"] = makeSimpleAttrConfig(makeCommonXPath("Tattoos:"))
config.mappedConfig["Piercings"] = makeSimpleAttrConfig(makeCommonXPath("Piercings:")) config.mappedConfig["Piercings"] = makeSimpleAttrConfig(makeCommonXPath("Piercings:"))
@@ -257,10 +234,17 @@ func makeXPathConfig() mappedPerformerScraperConfig {
config.mappedConfig["Gender"] = genderConfig config.mappedConfig["Gender"] = genderConfig
// use fixed for height // use fixed for height
config.mappedConfig["Height"] = mappedScraperAttrConfig{ config.mappedConfig["Country"] = mappedScraperAttrConfig{
Fixed: "1234", Fixed: "United States",
} }
heightConfig := makeSimpleAttrConfig(makeCommonXPath("Height:"))
heightConvAction := postProcessFeetToCm(true)
heightConfig.postProcessActions = []postProcessAction{
&heightConvAction,
}
config.mappedConfig["Height"] = heightConfig
return config return config
} }
@@ -313,7 +297,7 @@ func TestScrapePerformerXPath(t *testing.T) {
const careerLength = "2012 - 2019" const careerLength = "2012 - 2019"
const tattoosPiercings = "None" const tattoosPiercings = "None"
const gender = "Female" const gender = "Female"
const height = "1234" const height = "170"
verifyField(t, performerName, performer.Name, "Name") verifyField(t, performerName, performer.Name, "Name")
verifyField(t, gender, performer.Gender, "Gender") verifyField(t, gender, performer.Gender, "Gender")
@@ -331,7 +315,7 @@ func TestScrapePerformerXPath(t *testing.T) {
verifyField(t, tattoosPiercings, performer.Tattoos, "Tattoos") verifyField(t, tattoosPiercings, performer.Tattoos, "Tattoos")
verifyField(t, tattoosPiercings, performer.Piercings, "Piercings") verifyField(t, tattoosPiercings, performer.Piercings, "Piercings")
verifyField(t, height, performer.Height, "Piercings") verifyField(t, height, performer.Height, "Height")
} }
func TestConcatXPath(t *testing.T) { func TestConcatXPath(t *testing.T) {

View File

@@ -289,6 +289,7 @@ The `Measurements` xpath string will replace `$infoPiece` with `//div[@class="in
##### Post-processing options ##### Post-processing options
Post-processing operations are contained in the `postProcess` key. Post-processing operations are performed in the order they are specified. The following post-processing operations are available: Post-processing operations are contained in the `postProcess` key. Post-processing operations are performed in the order they are specified. The following post-processing operations are available:
* `feetToCm`: converts a string containing a height in feet and inches into centimetres, rounded to the nearest whole number. It looks for up to two separate integers and interprets the first as the number of feet and the second as the number of inches. The numbers can be separated by any non-numeric characters, including the `.` character, so decimal numbers are not supported. For example, `6.3` and `6ft3.3` would both be interpreted as 6 feet, 3 inches before being converted into centimetres.
* `map`: contains a map of input values to output values. Where a value matches one of the input values, it is replaced with the matching output value. If no value is matched, then value is unmodified. * `map`: contains a map of input values to output values. Where a value matches one of the input values, it is replaced with the matching output value. If no value is matched, then value is unmodified.
Example: Example:
@@ -303,6 +304,8 @@ performer:
``` ```
Gets the contents of the selected div element, and sets the returned value to `Female` if the scraped value is `F`; `Male` if the scraped value is `M`. Gets the contents of the selected div element, and sets the returned value to `Female` if the scraped value is `F`; `Male` if the scraped value is `M`.
* `parseDate`: if present, the value is the date format using go's reference date (2006-01-02). For example, if an example date was `14-Mar-2003`, then the date format would be `02-Jan-2006`. See the [time.Parse documentation](https://golang.org/pkg/time/#Parse) for details. When present, the scraper will convert the input string into a date, then convert it to the string format used by stash (`YYYY-MM-DD`).
* `replace`: contains an array of sub-objects. Each sub-object must have a `regex` and `with` field. The `regex` field is the regex pattern to replace, and `with` is the string to replace it with. `$` is used to reference capture groups - `$1` is the first capture group, `$2` the second and so on. Replacements are performed in order of the array. * `replace`: contains an array of sub-objects. Each sub-object must have a `regex` and `with` field. The `regex` field is the regex pattern to replace, and `with` is the string to replace it with. `$` is used to reference capture groups - `$1` is the first capture group, `$2` the second and so on. Replacements are performed in order of the array.
Example: Example:
@@ -317,7 +320,6 @@ CareerLength:
Replaces `2001 to 2003` with `2001-2003`. Replaces `2001 to 2003` with `2001-2003`.
* `subScraper`: if present, the sub-scraper will be executed after all other post-processes are complete and before parseDate. It then takes the value and performs an http request, using the value as the URL. Within the `subScraper` config is a nested scraping configuration. This allows you to traverse to other webpages to get the attribute value you are after. For more info and examples have a look at [#370](https://github.com/stashapp/stash/pull/370), [#606](https://github.com/stashapp/stash/pull/606) * `subScraper`: if present, the sub-scraper will be executed after all other post-processes are complete and before parseDate. It then takes the value and performs an http request, using the value as the URL. Within the `subScraper` config is a nested scraping configuration. This allows you to traverse to other webpages to get the attribute value you are after. For more info and examples have a look at [#370](https://github.com/stashapp/stash/pull/370), [#606](https://github.com/stashapp/stash/pull/606)
* `parseDate`: if present, the value is the date format using go's reference date (2006-01-02). For example, if an example date was `14-Mar-2003`, then the date format would be `02-Jan-2006`. See the [time.Parse documentation](https://golang.org/pkg/time/#Parse) for details. When present, the scraper will convert the input string into a date, then convert it to the string format used by stash (`YYYY-MM-DD`).
Additionally, there are a number of fixed post-processing fields that are specified at the attribute level (not in `postProcess`) that are performed after the `postProcess` operations: Additionally, there are a number of fixed post-processing fields that are specified at the attribute level (not in `postProcess`) that are performed after the `postProcess` operations:
* `concat`: if an xpath matches multiple elements, and `concat` is present, then all of the elements will be concatenated together * `concat`: if an xpath matches multiple elements, and `concat` is present, then all of the elements will be concatenated together