Merge pull request #518 from ddalcino/resilient-metadata-factory

Allow MetadataFactory to recover from bad HTML files
This commit is contained in:
Hiroshi Miura
2022-04-20 11:44:08 +09:00
committed by GitHub
3 changed files with 55 additions and 18 deletions

View File

@@ -18,14 +18,14 @@
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
from typing import Iterable
from typing import List
DOCS_CONFIG = "https://aqtinstall.readthedocs.io/en/stable/configuration.html#configuration"
class AqtException(Exception):
def __init__(self, *args, **kwargs):
self.suggested_action: Iterable[str] = kwargs.pop("suggested_action", [])
self.suggested_action: List[str] = kwargs.pop("suggested_action", [])
self.should_show_help: bool = kwargs.pop("should_show_help", False)
super(AqtException, self).__init__(*args, **kwargs)
@@ -40,6 +40,9 @@ class AqtException(Exception):
["* " + suggestion for suggestion in self.suggested_action]
)
def append_suggested_follow_up(self, suggestions: List[str]):
self.suggested_action.extend(suggestions)
class ArchiveDownloadError(AqtException):
pass

View File

@@ -450,7 +450,7 @@ class MetadataFactory:
return arches
def fetch_extensions(self, version: Version) -> List[str]:
versions_extensions = MetadataFactory.get_versions_extensions(
versions_extensions = self.get_versions_extensions(
self.fetch_http(self.archive_id.to_url(), False), self.archive_id.category
)
filtered = filter(
@@ -467,7 +467,7 @@ class MetadataFactory:
def get_version(ver_ext: Tuple[Version, str]):
return ver_ext[0]
versions_extensions = MetadataFactory.get_versions_extensions(
versions_extensions = self.get_versions_extensions(
self.fetch_http(self.archive_id.to_url(), False), self.archive_id.category
)
versions = sorted(filter(None, map(get_version, filter(filter_by, versions_extensions))))
@@ -479,7 +479,7 @@ class MetadataFactory:
def fetch_tools(self) -> List[str]:
html_doc = self.fetch_http(self.archive_id.to_url(), False)
return list(MetadataFactory.iterate_folders(html_doc, "tools"))
return list(self.iterate_folders(html_doc, "tools"))
def fetch_tool_modules(self, tool_name: str) -> List[str]:
tool_data = self._fetch_module_metadata(tool_name)
@@ -588,24 +588,32 @@ class MetadataFactory:
f"Connection to '{base_url}' failed. Retrying with fallback '{base_urls[i + 1]}'."
)
@staticmethod
def iterate_folders(html_doc: str, filter_category: str = "") -> Generator[str, None, None]:
def iterate_folders(self, html_doc: str, filter_category: str = "") -> Generator[str, None, None]:
def table_row_to_folder(tr: bs4.element.Tag) -> str:
try:
return tr.find_all("td")[1].a.contents[0].rstrip("/")
except (AttributeError, IndexError):
return ""
soup: bs4.BeautifulSoup = bs4.BeautifulSoup(html_doc, "html.parser")
for row in soup.body.table.find_all("tr"):
content: str = table_row_to_folder(row)
if not content or content == "Parent Directory":
continue
if content.startswith(filter_category):
yield content
try:
soup: bs4.BeautifulSoup = bs4.BeautifulSoup(html_doc, "html.parser")
for row in soup.body.table.find_all("tr"):
content: str = table_row_to_folder(row)
if not content or content == "Parent Directory":
continue
if content.startswith(filter_category):
yield content
except Exception as e:
url = posixpath.join(Settings.baseurl, self.archive_id.to_url())
raise ArchiveConnectionError(
f"Failed to retrieve the expected HTML page at {url}",
suggested_action=[
"Check your network connection.",
f"Make sure that you can access {url} in your web browser.",
],
) from e
@staticmethod
def get_versions_extensions(html_doc: str, category: str) -> Iterator[Tuple[Optional[Version], str]]:
def get_versions_extensions(self, html_doc: str, category: str) -> Iterator[Tuple[Optional[Version], str]]:
def folder_to_version_extension(folder: str) -> Tuple[Optional[Version], str]:
components = folder.split("_", maxsplit=2)
ext = "" if len(components) < 3 else components[2]
@@ -617,7 +625,7 @@ class MetadataFactory:
return map(
folder_to_version_extension,
MetadataFactory.iterate_folders(html_doc, category),
self.iterate_folders(html_doc, category),
)
@staticmethod
@@ -792,5 +800,5 @@ def show_list(meta: MetadataFactory):
else:
print(*output, sep=" ")
except (ArchiveDownloadError, ArchiveConnectionError) as e:
e.suggested_action = suggested_follow_up(meta)
e.append_suggested_follow_up(suggested_follow_up(meta))
raise e from e

View File

@@ -1,6 +1,7 @@
import hashlib
import json
import os
import posixpath
import re
import shutil
import sys
@@ -178,6 +179,31 @@ def test_list_versions_tools(monkeypatch, spec_regex, os_name, target, in_file,
assert f"{all_ver_for_spec}" == row
@pytest.mark.parametrize(
"html_doc",
(
"<html><body>Login to my public WIFI network:<form>...</form></body></html>",
"<html>malformed-html/",
),
)
def test_list_bad_html(monkeypatch, html_doc: str):
monkeypatch.setattr(MetadataFactory, "fetch_http", lambda *args, **kwargs: html_doc)
archive_id = ArchiveId("qt", "linux", "desktop")
expected_url = posixpath.join(Settings.baseurl, archive_id.to_url())
expected_exception = ArchiveConnectionError(
f"Failed to retrieve the expected HTML page at {expected_url}",
suggested_action=[
"Check your network connection.",
f"Make sure that you can access {expected_url} in your web browser.",
],
)
with pytest.raises(ArchiveConnectionError) as e:
MetadataFactory(archive_id).fetch_versions()
assert e.type == ArchiveConnectionError
assert format(e.value) == format(expected_exception)
@pytest.mark.parametrize(
"version,extension,in_file,expect_out_file",
[