"""Mirror the C++ draft pages published under BASE_URL as local Markdown.

Executing this module fetches the index page, queues every section link
found on it, then downloads each queued page on a thread pool and writes the
converted Markdown under the ``cppdraft`` directory next to this script.
"""
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
import queue
import threading

from bs4 import BeautifulSoup
from html_to_markdown import convert, ConversionOptions
import requests
from rich import box
from rich.console import Console, RenderableType
from rich.panel import Panel
from rich.progress import (
    Progress,
    BarColumn,
    TextColumn,
    TimeRemainingColumn,
    TimeElapsedColumn,
    MofNCompleteColumn,
    SpinnerColumn,
    TransferSpeedColumn,
    ProgressColumn,
    Task,
)
from rich.table import Table
from rich.text import Text

CURRENT_SCRIPT_DIR: Path = Path(__file__).parent
DRAFT_DIR: Path = CURRENT_SCRIPT_DIR / "cppdraft"
DRAFT_DIR.mkdir(exist_ok=True)

conversion_options: ConversionOptions = ConversionOptions(
    heading_style="atx",
    list_indent_type="spaces",
    list_indent_width=2,
    code_block_style="backticks",
    autolinks=True,
    highlight_style="double-equal",
)

session = requests.Session()
console = Console()
BASE_URL: str = "https://eel.is/c++draft"
# Seconds to wait for the server before giving up on a single request;
# without a timeout a stalled connection would block its worker forever.
REQUEST_TIMEOUT: float = 30.0
lock = threading.Lock()
# Relative page URLs discovered on the index page and still to be downloaded.
url_queue = queue.Queue()


def get_page_and_add_urls(url_part: str) -> str:
    """Fetch a page and enqueue every section link found on it.

    Args:
        url_part: Path relative to ``BASE_URL`` ("" for the index page).

    Returns:
        The raw HTML of the fetched page.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
    """
    response = session.get(f"{BASE_URL}/{url_part}", timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    html = response.text

    soup = BeautifulSoup(html, "html.parser")
    # A dict is used as an ordered set: the same href may occur several
    # times on the page, and queueing it twice would download and write the
    # same file twice (possibly from two threads at once).
    unique_links: dict[str, None] = {}
    # NOTE(review): "floded_abbr_ref" looks like a typo for
    # "folded_abbr_ref" -- confirm against the actual page markup.
    for search_class in ["abbr_ref", "floded_abbr_ref"]:
        for anchor in soup.find_all("a", class_=search_class):
            target = anchor.get("href")
            if not target:
                # Anchor without a usable href: nothing to download.
                continue
            # The full link is queued; an earlier variant kept only the
            # top-level section name (the part before the first dot).
            unique_links[str(target)] = None

    for link in unique_links:
        url_queue.put(link)
    return html


def convert_html_to_markdown(html: str, url_part: str) -> None:
    """Convert *html* to Markdown and save it under ``DRAFT_DIR``.

    A dotted name is mapped to nested directories: ``"a.b.c"`` is written to
    ``DRAFT_DIR/a/b/c.md``; a name without dots goes to
    ``DRAFT_DIR/<url_part>.md``.

    Args:
        html: HTML source of the page.
        url_part: Page name relative to ``BASE_URL``; determines the path.

    Raises:
        OSError: If the target directory or file cannot be written.
    """
    md = convert(html, options=conversion_options)

    if "." in url_part:
        *section_dirs, leaf = url_part.split(".")
        save_path = DRAFT_DIR.joinpath(*section_dirs, f"{leaf}.md").resolve()
    else:
        save_path = DRAFT_DIR / f"{url_part}.md"

    # Always ensure the directory exists, not only for dotted names.
    save_path.parent.mkdir(parents=True, exist_ok=True)
    save_path.write_text(md, encoding="utf-8")


def pull_and_save(url_part: str) -> None:
    """Download one page and store it as Markdown (best effort).

    Any failure is reported on the console instead of being raised, so one
    broken page does not stop the remaining downloads.

    Args:
        url_part: Page path relative to ``BASE_URL``.
    """
    try:
        response = session.get(
            f"{BASE_URL}/{url_part}", timeout=REQUEST_TIMEOUT
        )
        response.raise_for_status()
        convert_html_to_markdown(response.text, url_part)
    except Exception as exc:
        # Worker-thread boundary: keep going, but make the failure visible.
        # markup=False so brackets in the message are not parsed as markup.
        console.print(
            f"Failed to fetch {url_part!r}: {exc}", style="red", markup=False
        )


# Initial link collection.
# The main page visually shows only a small menu,
# but its HTML contains links to all sections.
convert_html_to_markdown(get_page_and_add_urls(""), "cppdraft")

try:
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        TransferSpeedColumn(),
        console=console,
    ) as progress:
        task_id = progress.add_task(
            "[cyan]Получение страниц...", total=url_queue.qsize()
        )

        with ThreadPoolExecutor(
            max_workers=6, thread_name_prefix="PullDraft"
        ) as executor:
            # Drain the queue into the pool; nothing adds to the queue
            # after the index page has been processed above.
            futures = []
            while not url_queue.empty():
                futures.append(executor.submit(pull_and_save, url_queue.get()))

            for future in as_completed(futures):
                try:
                    future.result()
                except Exception as e:
                    console.print(e)
                finally:
                    # Count the page as processed even when it failed, so
                    # the bar always reaches its total.
                    progress.advance(task_id)
except Exception as e:
    console.print(e)