From 0fc57ce1e0347134ef2f37d2cbefa4592f230d43 Mon Sep 17 00:00:00 2001
From: bnkai <48220860+bnkai@users.noreply.github.com>
Date: Mon, 18 May 2020 05:26:20 +0300
Subject: [PATCH] Fix xpath comments text (#550)

---
Notes (free-form commentary; not part of the commit message or the diff):

  * Adds a helper, NodeText, at the end of pkg/scraper/xpath.go and switches
    four htmlquery.InnerText call sites over to it: one in
    concatenateResults, one in applySubScraper and two in process.
  * NodeText returns htmlquery.OutputHTML(n, true) when n is a non-nil node
    of type html.CommentNode, and htmlquery.InnerText(n) for every other
    input. Presumably InnerText yields no usable text for comment nodes,
    which is what the subject line refers to -- confirm against htmlquery.
  * NOTE(review): the `n != nil` guard only protects the comment-node
    branch; a nil n is still passed on to htmlquery.InnerText. Confirm that
    InnerText tolerates nil, or return "" early.
  * NOTE(review): NodeText is exported but carries no doc comment.
  * The source of this patch had lost its line breaks. Blank context lines
    below are placed where the hunk line counts require them, and the
    indentation of context lines is rebuilt from the visible braces only;
    the absolute depth may differ from the real file.

 pkg/scraper/xpath.go | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/pkg/scraper/xpath.go b/pkg/scraper/xpath.go
index b3eb25777..c765101fc 100644
--- a/pkg/scraper/xpath.go
+++ b/pkg/scraper/xpath.go
@@ -164,7 +164,7 @@ func (c xpathScraperAttrConfig) concatenateResults(nodes []*html.Node) string {
 	result := []string{}
 
 	for _, elem := range nodes {
-		text := htmlquery.InnerText(elem)
+		text := NodeText(elem)
 		text = commonPostProcess(text)
 
 		result = append(result, text)
@@ -220,7 +220,7 @@ func (c xpathScraperAttrConfig) applySubScraper(value string) string {
 	if subScraper.hasConcat() {
 		result = subScraper.concatenateResults(found)
 	} else {
-		result = htmlquery.InnerText(found[0])
+		result = NodeText(found[0])
 		result = commonPostProcess(result)
 	}
 
@@ -277,7 +277,7 @@ func (s xpathScraperConfig) process(doc *html.Node, common commonXPathConfig) xP
 
 	if len(found) > 0 {
 		for i, elem := range found {
-			text := htmlquery.InnerText(elem)
+			text := NodeText(elem)
 			text = commonPostProcess(text)
 
 			ret = ret.setKey(i, k, text)
@@ -297,7 +297,7 @@ func (s xpathScraperConfig) process(doc *html.Node, common commonXPathConfig) xP
 		ret = ret.setKey(i, k, result)
 	} else {
 		for i, elem := range found {
-			text := htmlquery.InnerText(elem)
+			text := NodeText(elem)
 			text = commonPostProcess(text)
 			text = attrConfig.postProcess(text)
 
@@ -603,3 +603,10 @@ func scrapePerformerNamesXPath(c scraperTypeConfig, name string) ([]*models.Scra
 
 	return scraper.scrapePerformers(doc)
 }
+
+func NodeText(n *html.Node) string {
+	if n != nil && n.Type == html.CommentNode {
+		return htmlquery.OutputHTML(n, true)
+	}
+	return htmlquery.InnerText(n)
+}