143 lines
3.9 KiB
Python
143 lines
3.9 KiB
Python
from pathlib import Path
|
||
|
||
|
||
# Directory that contains this script; all output is written relative to it.
CURRENT_SCRIPT_DIR: Path = Path(__file__).parent

# Root folder for the generated Markdown files.
DRAFT_DIR: Path = CURRENT_SCRIPT_DIR / "cppdraft"

# Created at import time so later writes can assume the folder exists.
DRAFT_DIR.mkdir(exist_ok=True)
|
||
|
||
|
||
from bs4 import BeautifulSoup
|
||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||
from html_to_markdown import convert, ConversionOptions
|
||
import queue
|
||
import threading
|
||
import requests
|
||
from rich.progress import (
|
||
Progress,
|
||
BarColumn,
|
||
TextColumn,
|
||
TimeRemainingColumn,
|
||
TimeElapsedColumn,
|
||
MofNCompleteColumn,
|
||
SpinnerColumn,
|
||
TransferSpeedColumn,
|
||
ProgressColumn,
|
||
Task,
|
||
)
|
||
from rich.console import Console, RenderableType
|
||
from rich.panel import Panel
|
||
from rich.text import Text
|
||
from rich.table import Table
|
||
from rich import box
|
||
|
||
|
||
# Options passed to html_to_markdown.convert() for every page.
conversion_options: ConversionOptions = ConversionOptions(
    heading_style="atx",
    list_indent_type="spaces",
    list_indent_width=2,
    code_block_style="backticks",
    autolinks=True,
    highlight_style="double-equal",
)

# One HTTP session and one console shared by the whole script.
session = requests.Session()
console = Console()

# Base address that every relative section reference is appended to.
BASE_URL: str = "https://eel.is/c++draft"

lock = threading.Lock()

# Section references waiting to be downloaded.
url_queue = queue.Queue()
|
||
|
||
|
||
def get_page_and_add_urls(url_part: str) -> str:
    """Download one page and enqueue the section links found on it.

    Fetches ``{BASE_URL}/{url_part}``, collects the ``href`` of every ``<a>``
    element carrying one of the searched CSS classes, and puts each distinct
    target on ``url_queue`` in first-seen order.

    Args:
        url_part: Path relative to ``BASE_URL``; an empty string requests the
            index page.

    Returns:
        The raw HTML text of the fetched page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    # A timeout keeps a stalled connection from blocking the script forever.
    response = session.get(f"{BASE_URL}/{url_part}", timeout=30)
    response.raise_for_status()
    html = response.text

    bs = BeautifulSoup(html, "html.parser")

    # Dict keys de-duplicate the targets while keeping first-seen order, so
    # a section linked several times is downloaded only once.
    found = {}
    # NOTE(review): "floded_abbr_ref" looks like a typo of "folded_abbr_ref";
    # left unchanged here -- confirm against the site's actual markup.
    for search_class in ("abbr_ref", "floded_abbr_ref"):
        for anchor in bs.find_all("a", class_=search_class):
            target = anchor.attrs.get("href")
            if not target:
                # Anchor without a usable href: nothing to follow.
                continue
            # The full dotted reference is kept, not only the top-level
            # section before the first dot.
            found.setdefault(str(target), None)

    for target in found:
        url_queue.put(target)

    return html
|
||
|
||
|
||
def convert_html_to_markdown(html: str, url_part: str) -> None:
    """Convert a page's HTML to Markdown and write it under ``DRAFT_DIR``.

    A dotted ``url_part`` is spread over nested directories: every component
    but the last becomes a directory and the last one names the file, so
    ``"a.b.c"`` is written to ``DRAFT_DIR/a/b/c.md``.  A name without a dot
    is written to ``DRAFT_DIR/<url_part>.md``.

    Args:
        html: HTML source of the page.
        url_part: Section reference the page was fetched from.

    Raises:
        ValueError: If the resulting path would fall outside ``DRAFT_DIR``.
        OSError: If the directory or the file cannot be written.
    """
    md = convert(html, options=conversion_options)

    # Without a dot ``dirs`` is empty and the file lands directly in DRAFT_DIR.
    *dirs, leaf = url_part.split(".")
    save_path = DRAFT_DIR.joinpath(*dirs, f"{leaf}.md").resolve()

    # ``url_part`` comes from scraped hrefs, so refuse anything (e.g. "../"
    # or an absolute component) that would escape the output folder.
    root = DRAFT_DIR.resolve()
    if root not in save_path.parents:
        raise ValueError(f"refusing to write outside {root}: {url_part!r}")

    save_path.parent.mkdir(parents=True, exist_ok=True)
    save_path.write_text(md, encoding="utf-8")
|
||
|
||
|
||
def pull_and_save(url_part: str) -> None:
    """Fetch one page and store it as Markdown, reporting any failure.

    Downloads ``{BASE_URL}/{url_part}`` and hands the HTML to
    ``convert_html_to_markdown``.  Exceptions raised along the way are
    printed to the console instead of being propagated, so a single bad
    page does not stop the remaining downloads.

    Args:
        url_part: Section reference relative to ``BASE_URL``.
    """
    try:
        # A timeout keeps a stalled connection from blocking a worker forever.
        response = session.get(f"{BASE_URL}/{url_part}", timeout=30)
        response.raise_for_status()
        convert_html_to_markdown(response.text, url_part)
    except Exception as e:
        # Worker boundary: stay best-effort, but make the failure visible.
        # markup=False keeps brackets in the error text from being parsed
        # as Rich markup.
        console.print(
            f"Failed to fetch {url_part!r}: {e}", style="red", markup=False
        )
|
||
|
||
|
||
# Initial link discovery.
# The index page visually shows only a small menu,
# but its HTML contains links to every section.
convert_html_to_markdown(get_page_and_add_urls(""), "cppdraft")

try:
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        TransferSpeedColumn(),
        console=console,
    ) as progress:
        # The total is taken before the queue is drained below.
        task_id = progress.add_task(
            "[cyan]Получение страниц...", total=url_queue.qsize()
        )

        with ThreadPoolExecutor(
            max_workers=6, thread_name_prefix="PullDraft"
        ) as executor:
            futures = []

            # Only this thread consumes the queue, so empty()/get() is safe.
            while not url_queue.empty():
                futures.append(executor.submit(pull_and_save, url_queue.get()))

            for future in as_completed(futures):
                try:
                    future.result()
                except Exception as e:
                    console.print(e)
                finally:
                    # Count failed pages too, otherwise the bar can never
                    # reach its total.
                    progress.advance(task_id)
except Exception as e:
    console.print(e)
|
||
|