Files
cppdraft_translate/pulldraft.py
2025-10-25 03:02:53 +03:00

143 lines
3.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from pathlib import Path

# Output directory for the converted Markdown pages, created next to this
# script before anything else runs.
CURRENT_SCRIPT_DIR: Path = Path(__file__).parent
DRAFT_DIR: Path = CURRENT_SCRIPT_DIR / "cppdraft"
DRAFT_DIR.mkdir(exist_ok=True)

from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from html_to_markdown import convert, ConversionOptions
import queue
import threading
import requests
from rich.progress import (
    Progress,
    BarColumn,
    TextColumn,
    TimeRemainingColumn,
    TimeElapsedColumn,
    MofNCompleteColumn,
    SpinnerColumn,
    TransferSpeedColumn,
    ProgressColumn,
    Task,
)
from rich.console import Console, RenderableType
from rich.panel import Panel
from rich.text import Text
from rich.table import Table
from rich import box
# HTML -> Markdown conversion settings: "#" headings, 2-space list indents,
# fenced code blocks, autolinks, and ==double-equal== highlight syntax.
conversion_options: ConversionOptions = ConversionOptions(
    heading_style="atx",
    list_indent_type="spaces",
    list_indent_width=2,
    code_block_style="backticks",
    autolinks=True,
    highlight_style="double-equal"
)

# One shared HTTP session (connection pooling) used by all worker threads.
session = requests.Session()
console = Console()
BASE_URL: str = "https://eel.is/c++draft"
# Shared between the main thread and the pool workers.
lock = threading.Lock()
# Section URL parts discovered on the landing page, consumed by the pool below.
url_queue = queue.Queue()
def get_page_and_add_urls(url_part: str) -> str:
    """Fetch one draft page and enqueue every section link found on it.

    Downloads ``{BASE_URL}/{url_part}``, extracts the ``href`` of every
    ``<a>`` element whose class marks it as a section cross-reference, and
    puts each *distinct* href on the shared ``url_queue`` for the worker
    threads.

    Returns the raw HTML so the caller can also convert this page itself.
    Raises requests.HTTPError on a non-2xx response.
    """
    response = session.get(f"{BASE_URL}/{url_part}")
    response.raise_for_status()
    html = response.text
    bs = BeautifulSoup(html, "html.parser")

    # Collect into a set first: the same section is linked many times per
    # page, and enqueueing every occurrence caused duplicate downloads.
    found: set[str] = set()
    # NOTE(review): "floded_abbr_ref" looks like a typo for "folded_…" but is
    # a live CSS-class selector against the remote site — confirm before changing.
    for search_class in ["abbr_ref", "floded_abbr_ref"]:
        for anchor in bs.find_all("a", class_=search_class):
            found.add(str(anchor.attrs["href"]))
    for url in found:
        url_queue.put(url)
    return html
def convert_html_to_markdown(html: str, url_part: str) -> None:
    """Convert a draft page's HTML to Markdown and save it under DRAFT_DIR.

    A dotted ``url_part`` such as ``"basic.life"`` maps to the nested path
    ``DRAFT_DIR/basic/life.md``; an undotted one maps to
    ``DRAFT_DIR/<url_part>.md``.
    """
    md = convert(html, options=conversion_options)
    # "a.b.c" -> DRAFT_DIR/a/b/c.md ; no dot -> DRAFT_DIR/<url_part>.md
    # (split on a dotless string yields [url_part], so both cases unify).
    parts = url_part.split(".")
    save_path = DRAFT_DIR.joinpath(*parts[:-1], f"{parts[-1]}.md")
    save_path.parent.mkdir(parents=True, exist_ok=True)
    save_path.write_text(md, encoding="utf-8")
def pull_and_save(url_part: str) -> None:
    """Worker task: download one section page and save it as Markdown.

    Best-effort: a failed download or conversion is reported on the console
    but never propagates, so one broken section does not abort the crawl.
    """
    try:
        response = session.get(f"{BASE_URL}/{url_part}")
        response.raise_for_status()
        convert_html_to_markdown(response.text, url_part)
    except Exception as e:
        # Keep the original's best-effort semantics, but surface the error:
        # the bare "pass" it replaced hid every failure, including bugs.
        console.print(f"[red]Failed {url_part}: {e}[/red]")
# Seed the queue: the landing page visually shows only a small menu, but its
# HTML links every section of the draft. Convert the landing page itself too.
convert_html_to_markdown(get_page_and_add_urls(""), "cppdraft")

try:
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        TransferSpeedColumn(),
        console=console,
    ) as progress:
        # qsize() is stable here: all producers finished before this point.
        task_id = progress.add_task(
            "[cyan]Получение страниц...", total=url_queue.qsize()
        )
        with ThreadPoolExecutor(max_workers=6, thread_name_prefix="PullDraft") as executor:
            futures = []
            while not url_queue.empty():
                futures.append(executor.submit(pull_and_save, url_queue.get()))
            for future in as_completed(futures):
                try:
                    future.result()
                except Exception as e:
                    console.print(e)
                finally:
                    # Advance even when a future raised — the original only
                    # advanced on success, leaving the bar short of its total.
                    progress.advance(task_id)
except Exception as e:
    console.print(e)