Allow MetadataFactory to recover from bad HTML files

This change allows MetadataFactory.iterate_folders to recognize when
MetadataFactory has retrieved a malformed HTML file, or an HTML file
that does not contain the table and rows that it expects to see.

If the URL is correct, it should retrieve the right page, but this is
not guaranteed, and MetadataFactory should be able to recover and
generate an appropriate error message.

One situation where this kind of error might occur is when the user is
connected to a public Wi-Fi network but not yet logged in.
The router would redirect aqt to the network's login page, and aqt would
have no way of knowing that it did not receive the correct page.
aqt should be resilient to this kind of error.
This commit is contained in:
Dave Dalcino
2022-04-19 18:26:21 -07:00
parent 320df539c0
commit e8672f3fbe
3 changed files with 56 additions and 16 deletions

View File

@@ -40,6 +40,12 @@ class AqtException(Exception):
["* " + suggestion for suggestion in self.suggested_action]
)
def append_suggested_follow_up(self, suggestions: Iterable[str]) -> None:
    """Append *suggestions* to this exception's list of suggested actions.

    The incoming iterable is materialized into a list: the original code
    stored the raw iterable in the empty case, so a one-shot iterator
    (e.g. a generator) would be exhausted after the first display and the
    attribute's type would differ between the two branches.
    """
    if self.suggested_action:
        # Preserve existing suggestions and extend with the new ones.
        self.suggested_action = [*self.suggested_action, *suggestions]
    else:
        self.suggested_action = list(suggestions)
class ArchiveDownloadError(AqtException):
    """Exception raised for errors encountered while downloading archives."""

    pass

View File

@@ -450,7 +450,7 @@ class MetadataFactory:
return arches
def fetch_extensions(self, version: Version) -> List[str]:
versions_extensions = MetadataFactory.get_versions_extensions(
versions_extensions = self.get_versions_extensions(
self.fetch_http(self.archive_id.to_url(), False), self.archive_id.category
)
filtered = filter(
@@ -467,7 +467,7 @@ class MetadataFactory:
def get_version(ver_ext: Tuple[Version, str]):
return ver_ext[0]
versions_extensions = MetadataFactory.get_versions_extensions(
versions_extensions = self.get_versions_extensions(
self.fetch_http(self.archive_id.to_url(), False), self.archive_id.category
)
versions = sorted(filter(None, map(get_version, filter(filter_by, versions_extensions))))
@@ -479,7 +479,7 @@ class MetadataFactory:
def fetch_tools(self) -> List[str]:
html_doc = self.fetch_http(self.archive_id.to_url(), False)
return list(MetadataFactory.iterate_folders(html_doc, "tools"))
return list(self.iterate_folders(html_doc, "tools"))
def fetch_tool_modules(self, tool_name: str) -> List[str]:
tool_data = self._fetch_module_metadata(tool_name)
@@ -588,24 +588,32 @@ class MetadataFactory:
f"Connection to '{base_url}' failed. Retrying with fallback '{base_urls[i + 1]}'."
)
@staticmethod
def iterate_folders(html_doc: str, filter_category: str = "") -> Generator[str, None, None]:
def iterate_folders(self, html_doc: str, filter_category: str = "") -> Generator[str, None, None]:
def table_row_to_folder(tr: bs4.element.Tag) -> str:
    """Return the folder name linked from a directory-listing row.

    Returns "" when the row does not have the expected
    <td>/<a> structure (e.g. header or separator rows).
    """
    try:
        link_cell = tr.find_all("td")[1]
        folder = link_cell.a.contents[0]
    except (AttributeError, IndexError):
        # Row is not a folder entry; caller skips empty results.
        return ""
    return folder.rstrip("/")
soup: bs4.BeautifulSoup = bs4.BeautifulSoup(html_doc, "html.parser")
for row in soup.body.table.find_all("tr"):
content: str = table_row_to_folder(row)
if not content or content == "Parent Directory":
continue
if content.startswith(filter_category):
yield content
try:
soup: bs4.BeautifulSoup = bs4.BeautifulSoup(html_doc, "html.parser")
for row in soup.body.table.find_all("tr"):
content: str = table_row_to_folder(row)
if not content or content == "Parent Directory":
continue
if content.startswith(filter_category):
yield content
except Exception as e:
url = posixpath.join(Settings.baseurl, self.archive_id.to_url())
raise ArchiveConnectionError(
f"Failed to retrieve the expected HTML page at {url}",
suggested_action=[
"Check your network connection.",
f"Make sure that you can access {url} in your web browser.",
],
) from e
@staticmethod
def get_versions_extensions(html_doc: str, category: str) -> Iterator[Tuple[Optional[Version], str]]:
def get_versions_extensions(self, html_doc: str, category: str) -> Iterator[Tuple[Optional[Version], str]]:
def folder_to_version_extension(folder: str) -> Tuple[Optional[Version], str]:
components = folder.split("_", maxsplit=2)
ext = "" if len(components) < 3 else components[2]
@@ -617,7 +625,7 @@ class MetadataFactory:
return map(
folder_to_version_extension,
MetadataFactory.iterate_folders(html_doc, category),
self.iterate_folders(html_doc, category),
)
@staticmethod
@@ -792,5 +800,5 @@ def show_list(meta: MetadataFactory):
else:
print(*output, sep=" ")
except (ArchiveDownloadError, ArchiveConnectionError) as e:
e.suggested_action = suggested_follow_up(meta)
e.append_suggested_follow_up(suggested_follow_up(meta))
raise e from e

View File

@@ -1,6 +1,7 @@
import hashlib
import json
import os
import posixpath
import re
import shutil
import sys
@@ -178,6 +179,31 @@ def test_list_versions_tools(monkeypatch, spec_regex, os_name, target, in_file,
assert f"{all_ver_for_spec}" == row
@pytest.mark.parametrize(
    "html_doc",
    (
        # Captive-portal style login page: valid HTML, but not a directory listing.
        "<html><body>Login to my public WIFI network:<form>...</form></body></html>",
        # Truncated/malformed HTML document.
        "<html>malformed-html/",
    ),
)
def test_list_bad_html(monkeypatch, html_doc: str):
    """MetadataFactory should raise ArchiveConnectionError with helpful
    follow-up suggestions when the fetched page is not the expected
    directory-listing HTML (e.g. a captive-portal page or malformed HTML).
    """
    # Replace the network fetch so the factory receives the bad document.
    monkeypatch.setattr(MetadataFactory, "fetch_http", lambda *args, **kwargs: html_doc)
    archive_id = ArchiveId("qt", "linux", "desktop")
    expected_url = posixpath.join(Settings.baseurl, archive_id.to_url())
    expected_exception = ArchiveConnectionError(
        f"Failed to retrieve the expected HTML page at {expected_url}",
        suggested_action=[
            "Check your network connection.",
            f"Make sure that you can access {expected_url} in your web browser.",
        ],
    )
    with pytest.raises(ArchiveConnectionError) as e:
        MetadataFactory(archive_id).fetch_versions()
    # Both the exception type and its formatted message/suggestions must match.
    assert e.type == ArchiveConnectionError
    assert format(e.value) == format(expected_exception)
@pytest.mark.parametrize(
"version,extension,in_file,expect_out_file",
[