This commit is contained in:
2025-10-25 03:02:53 +03:00
commit 043225d523
3416 changed files with 681196 additions and 0 deletions

142
pulldraft.py Normal file
View File

@@ -0,0 +1,142 @@
from pathlib import Path
CURRENT_SCRIPT_DIR: Path = Path(__file__).parent
DRAFT_DIR: Path = CURRENT_SCRIPT_DIR / "cppdraft"
DRAFT_DIR.mkdir(exist_ok=True)
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from html_to_markdown import convert, ConversionOptions
import queue
import threading
import requests
from rich.progress import (
Progress,
BarColumn,
TextColumn,
TimeRemainingColumn,
TimeElapsedColumn,
MofNCompleteColumn,
SpinnerColumn,
TransferSpeedColumn,
ProgressColumn,
Task,
)
from rich.console import Console, RenderableType
from rich.panel import Panel
from rich.text import Text
from rich.table import Table
from rich import box
# Markdown conversion settings: ATX headings ("#"), two-space list indents,
# fenced (backtick) code blocks, automatic links, and ==double-equal==
# highlight marks.
conversion_options: ConversionOptions = ConversionOptions(
    heading_style="atx",
    list_indent_type="spaces",
    list_indent_width=2,
    code_block_style="backticks",
    autolinks=True,
    highlight_style="double-equal"
)
# One shared HTTP session so TCP connections to the draft host are reused
# across the worker threads.
session = requests.Session()
console = Console()
# Root of the eel.is HTML rendering of the C++ working draft.
BASE_URL: str = "https://eel.is/c++draft"
# NOTE(review): `lock` is never acquired anywhere in this file — confirm it
# is intentional before removing.
lock = threading.Lock()
# Section URLs discovered on the index page, consumed by the worker pool.
url_queue = queue.Queue()
def get_page_and_add_urls(url_part: str) -> str:
    """Fetch one draft page and enqueue every section link found on it.

    The landing page visually shows only a small menu, but its HTML
    contains anchors for every section (both expanded ``abbr_ref`` and
    folded ``floded_abbr_ref`` ones).

    Parameters:
        url_part: path appended to BASE_URL ("" fetches the index page).

    Returns:
        The raw HTML of the fetched page.

    Raises:
        requests.HTTPError: if the server responds with an error status.
    """
    response = session.get(f"{BASE_URL}/{url_part}")
    response.raise_for_status()
    html = response.text
    bs = BeautifulSoup(html, "html.parser")
    # Deduplicate before enqueueing: the original code created a set for
    # this but never filled it (the filling code was commented out), so
    # the same section could be downloaded multiple times.
    seen: set = set()
    for search_class in ("abbr_ref", "floded_abbr_ref"):
        for anchor in bs.find_all("a", class_=search_class):
            url = str(anchor.attrs["href"])
            if url not in seen:
                seen.add(url)
                url_queue.put(url)
    return html
def convert_html_to_markdown(html: str, url_part: str) -> None:
    """Convert page HTML to Markdown and save it under DRAFT_DIR.

    Dotted section names such as "basic.def.odr" map to nested
    directories: DRAFT_DIR/basic/def/odr.md.  Undotted names are saved
    directly as DRAFT_DIR/<url_part>.md.

    Parameters:
        html: raw page HTML to convert.
        url_part: the section's URL path component, used to derive the
            output file path.
    """
    md = convert(html, options=conversion_options)
    # The original computed a flat save_path first and kept two unused
    # locals (subpath, filename); compute the path once per branch instead.
    if "." in url_part:
        *dirs, leaf = url_part.split(".")
        save_path = (DRAFT_DIR / "/".join(dirs) / f"{leaf}.md").resolve()
        save_path.parent.mkdir(parents=True, exist_ok=True)
    else:
        save_path = DRAFT_DIR / f"{url_part}.md"
    save_path.write_text(md, encoding="utf-8")
def pull_and_save(url_part: str) -> None:
    """Download one draft page and persist it as Markdown.

    Best-effort: any failure is reported to the console instead of
    propagating, so one broken page does not abort the whole pull.

    Parameters:
        url_part: the section's URL path component under BASE_URL.
    """
    try:
        response = session.get(f"{BASE_URL}/{url_part}")
        response.raise_for_status()
        convert_html_to_markdown(response.text, url_part)
    except Exception as exc:
        # Was a silent `except: pass`, which hid every failed download;
        # keep the best-effort contract but make failures visible.
        console.print(f"[red]Failed to pull {url_part}: {exc}[/red]")
# Initial link collection.
# The landing page visually shows only a small menu,
# but its HTML contains links to every section of the draft.
convert_html_to_markdown(get_page_and_add_urls(""), "cppdraft")
try:
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        TransferSpeedColumn(),
        console=console,
    ) as progress:
        # Queue is fully populated at this point, so qsize() is the final
        # page count for the progress bar.
        task_id = progress.add_task(
            "[cyan]Получение страниц...", total=url_queue.qsize()
        )
        # Drain the queue into the pool. get_page_and_add_urls has already
        # returned, so no producer is adding items concurrently and the
        # empty()/get() pair cannot race here.
        with ThreadPoolExecutor(max_workers=6, thread_name_prefix="PullDraft") as executor:
            futures = []
            while not url_queue.empty():
                url = url_queue.get()
                future = executor.submit(
                    pull_and_save,
                    url
                )
                futures.append(future)
            for future in as_completed(futures):
                try:
                    future.result()
                    # NOTE(review): the bar advances only on success, so a
                    # failed page leaves the total short of 100% — confirm
                    # that is intended.
                    progress.advance(task_id)
                except Exception as e:
                    console.print(e)
except Exception as e:
    console.print(e)