Init
This commit is contained in:
142
pulldraft.py
Normal file
142
pulldraft.py
Normal file
@@ -0,0 +1,142 @@
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Directory that contains this script file.
CURRENT_SCRIPT_DIR: Path = Path(__file__).parent
# Output directory for the generated Markdown files, located next to the script.
DRAFT_DIR: Path = CURRENT_SCRIPT_DIR.joinpath("cppdraft")
# Created as a module-level side effect so later writes can assume it exists.
DRAFT_DIR.mkdir(exist_ok=True)
|
||||
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from html_to_markdown import convert, ConversionOptions
|
||||
import queue
|
||||
import threading
|
||||
import requests
|
||||
from rich.progress import (
|
||||
Progress,
|
||||
BarColumn,
|
||||
TextColumn,
|
||||
TimeRemainingColumn,
|
||||
TimeElapsedColumn,
|
||||
MofNCompleteColumn,
|
||||
SpinnerColumn,
|
||||
TransferSpeedColumn,
|
||||
ProgressColumn,
|
||||
Task,
|
||||
)
|
||||
from rich.console import Console, RenderableType
|
||||
from rich.panel import Panel
|
||||
from rich.text import Text
|
||||
from rich.table import Table
|
||||
from rich import box
|
||||
|
||||
|
||||
# Markdown rendering settings handed to convert() for every page.
conversion_options: ConversionOptions = ConversionOptions(
    autolinks=True,
    code_block_style="backticks",
    heading_style="atx",
    highlight_style="double-equal",
    list_indent_type="spaces",
    list_indent_width=2,
)
|
||||
|
||||
# One HTTP session shared by every request the script makes.
session: requests.Session = requests.Session()
# Rich console used by the progress display and for printing errors.
console: Console = Console()

# Root URL of the online draft; page paths are appended after a slash.
BASE_URL: str = "https://eel.is/c++draft"
# NOTE(review): not referenced anywhere in the visible code — confirm it is still needed.
lock = threading.Lock()

# Links collected by get_page_and_add_urls(), waiting to be downloaded.
url_queue: "queue.Queue[str]" = queue.Queue()
|
||||
|
||||
|
||||
def get_page_and_add_urls(url_part: str) -> str:
    """Download one draft page and enqueue the section links found on it.

    Fetches ``{BASE_URL}/{url_part}``, collects the ``href`` of every ``<a>``
    element carrying one of the searched CSS classes, and puts each distinct
    link (in order of first appearance) onto the module-level ``url_queue``.

    Args:
        url_part: Page path relative to ``BASE_URL``; ``""`` requests the
            index page.

    Returns:
        The raw HTML text of the downloaded page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    # A timeout keeps a stalled connection from hanging the script forever.
    response = session.get(f"{BASE_URL}/{url_part}", timeout=30)
    response.raise_for_status()
    html = response.text

    soup = BeautifulSoup(html, "html.parser")

    # Dict keys serve as an insertion-ordered set, so a link that appears
    # several times on the page is queued (and later downloaded) only once.
    found: dict[str, None] = {}
    # NOTE(review): "floded_abbr_ref" looks like a typo for "folded_abbr_ref";
    # confirm against the site's markup before changing the literal.
    for search_class in ("abbr_ref", "floded_abbr_ref"):
        for anchor in soup.find_all("a", class_=search_class):
            href = anchor.get("href")
            # Anchors without a usable href carry nothing to download.
            if href:
                found[str(href)] = None

    # The full link is queued, not just the part before the first dot.
    for url in found:
        url_queue.put(url)

    return html
|
||||
|
||||
|
||||
def convert_html_to_markdown(html: str, url_part: str):
    """Convert an HTML page to Markdown and write it below ``DRAFT_DIR``.

    Dots in ``url_part`` act as directory separators: ``"a.b.c"`` is written
    to ``DRAFT_DIR/a/b/c.md``, while a name without dots is written to
    ``DRAFT_DIR/<url_part>.md``. Missing parent directories are created.

    Args:
        html: HTML source of the page.
        url_part: Page identifier used to derive the output file path.

    Raises:
        ValueError: If the derived path would fall outside ``DRAFT_DIR``.
        OSError: If the directory or the file cannot be written.
    """
    md = convert(html, options=conversion_options)

    # Everything before the last dot becomes the directory chain; the last
    # component is the file stem. With no dot the chain is empty and the
    # file lands directly in DRAFT_DIR.
    *parent_parts, stem = url_part.split(".")
    root = DRAFT_DIR.resolve()
    save_path = (root / "/".join(parent_parts) / f"{stem}.md").resolve()

    # url_part comes from scraped hrefs: refuse anything (e.g. "../x.y")
    # that would resolve to a location outside the output directory.
    if root not in save_path.parents:
        raise ValueError(f"Refusing to write outside {root}: {url_part!r}")

    # Always create the parent: a name containing "/" needs it even when
    # there is no dot in it.
    save_path.parent.mkdir(parents=True, exist_ok=True)
    save_path.write_text(md, encoding="utf-8")
|
||||
|
||||
|
||||
def pull_and_save(url_part: str):
    """Download one draft page and store it as Markdown.

    Fetches ``{BASE_URL}/{url_part}`` and passes the HTML to
    ``convert_html_to_markdown``. The function is best-effort: a failure is
    reported on the console and does not propagate to the caller.

    Args:
        url_part: Page path relative to ``BASE_URL``.
    """
    try:
        # A timeout keeps one stalled connection from blocking a worker forever.
        response = session.get(f"{BASE_URL}/{url_part}", timeout=30)
        response.raise_for_status()
        convert_html_to_markdown(response.text, url_part)
    except Exception as exc:
        # Task boundary: keep the crawl going, but make the failure visible
        # instead of silently dropping the page. markup=False prevents
        # brackets in the message from being parsed as rich markup.
        console.print(f"{BASE_URL}/{url_part}: {exc!r}", markup=False)
|
||||
|
||||
|
||||
# Initial link harvest.
# The main page visually shows only a small menu,
# but its HTML contains links to every section.
convert_html_to_markdown(get_page_and_add_urls(""), "cppdraft")

try:
    # Drain the queue up front. Dict keys keep first-seen order while dropping
    # repeated links, so each page is downloaded once and the progress total
    # matches the number of tasks actually submitted.
    pending = {}
    while not url_queue.empty():
        pending[url_queue.get()] = None

    # NOTE(review): TransferSpeedColumn is designed for byte counts; here one
    # step is one page, so the displayed rate unit may be misleading — confirm.
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        TransferSpeedColumn(),
        console=console,
    ) as progress:
        task_id = progress.add_task(
            "[cyan]Получение страниц...", total=len(pending)
        )

        with ThreadPoolExecutor(max_workers=6, thread_name_prefix="PullDraft") as executor:
            futures = [executor.submit(pull_and_save, url) for url in pending]

            for future in as_completed(futures):
                try:
                    future.result()
                except Exception as e:
                    console.print(e)
                finally:
                    # Count the task as finished even when it failed, so the
                    # bar can always reach its total.
                    progress.advance(task_id)
except Exception as e:
    # Top-level boundary: report the error instead of showing a traceback.
    console.print(e)
|
||||
|
||||
Reference in New Issue
Block a user