This commit is contained in:
2025-10-25 03:02:53 +03:00
commit 043225d523
3416 changed files with 681196 additions and 0 deletions

142
pulldraft.py Normal file
View File

@@ -0,0 +1,142 @@
from pathlib import Path
CURRENT_SCRIPT_DIR: Path = Path(__file__).parent
DRAFT_DIR: Path = CURRENT_SCRIPT_DIR / "cppdraft"
DRAFT_DIR.mkdir(exist_ok=True)
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from html_to_markdown import convert, ConversionOptions
import queue
import threading
import requests
from rich.progress import (
Progress,
BarColumn,
TextColumn,
TimeRemainingColumn,
TimeElapsedColumn,
MofNCompleteColumn,
SpinnerColumn,
TransferSpeedColumn,
ProgressColumn,
Task,
)
from rich.console import Console, RenderableType
from rich.panel import Panel
from rich.text import Text
from rich.table import Table
from rich import box
# Markdown conversion settings: ATX headings ("#"), two-space list indents,
# fenced (backtick) code blocks, automatic links, and ==double-equal==
# highlight marks.
conversion_options: ConversionOptions = ConversionOptions(
    heading_style="atx",
    list_indent_type="spaces",
    list_indent_width=2,
    code_block_style="backticks",
    autolinks=True,
    highlight_style="double-equal"
)
# One shared HTTP session so TCP connections to the draft host are reused
# across the worker threads.
session = requests.Session()
console = Console()
# Root of the eel.is HTML rendering of the C++ working draft.
BASE_URL: str = "https://eel.is/c++draft"
# NOTE(review): `lock` is never acquired anywhere in this file — confirm it
# is intentional before removing.
lock = threading.Lock()
# Section URLs discovered on the index page, consumed by the worker pool.
url_queue = queue.Queue()
def get_page_and_add_urls(url_part: str) -> str:
    """Fetch one draft page and enqueue every section link found on it.

    The landing page visually shows only a small menu, but its HTML
    contains anchors for every section (both expanded ``abbr_ref`` and
    folded ``floded_abbr_ref`` ones).

    Parameters:
        url_part: path appended to BASE_URL ("" fetches the index page).

    Returns:
        The raw HTML of the fetched page.

    Raises:
        requests.HTTPError: if the server responds with an error status.
    """
    response = session.get(f"{BASE_URL}/{url_part}")
    response.raise_for_status()
    html = response.text
    bs = BeautifulSoup(html, "html.parser")
    # Deduplicate before enqueueing: the original code created a set for
    # this but never filled it (the filling code was commented out), so
    # the same section could be downloaded multiple times.
    seen: set = set()
    for search_class in ("abbr_ref", "floded_abbr_ref"):
        for anchor in bs.find_all("a", class_=search_class):
            url = str(anchor.attrs["href"])
            if url not in seen:
                seen.add(url)
                url_queue.put(url)
    return html
def convert_html_to_markdown(html: str, url_part: str) -> None:
    """Convert page HTML to Markdown and save it under DRAFT_DIR.

    Dotted section names such as "basic.def.odr" map to nested
    directories: DRAFT_DIR/basic/def/odr.md.  Undotted names are saved
    directly as DRAFT_DIR/<url_part>.md.

    Parameters:
        html: raw page HTML to convert.
        url_part: the section's URL path component, used to derive the
            output file path.
    """
    md = convert(html, options=conversion_options)
    # The original computed a flat save_path first and kept two unused
    # locals (subpath, filename); compute the path once per branch instead.
    if "." in url_part:
        *dirs, leaf = url_part.split(".")
        save_path = (DRAFT_DIR / "/".join(dirs) / f"{leaf}.md").resolve()
        save_path.parent.mkdir(parents=True, exist_ok=True)
    else:
        save_path = DRAFT_DIR / f"{url_part}.md"
    save_path.write_text(md, encoding="utf-8")
def pull_and_save(url_part: str) -> None:
    """Download one draft page and persist it as Markdown.

    Best-effort: any failure is reported to the console instead of
    propagating, so one broken page does not abort the whole pull.

    Parameters:
        url_part: the section's URL path component under BASE_URL.
    """
    try:
        response = session.get(f"{BASE_URL}/{url_part}")
        response.raise_for_status()
        convert_html_to_markdown(response.text, url_part)
    except Exception as exc:
        # Was a silent `except: pass`, which hid every failed download;
        # keep the best-effort contract but make failures visible.
        console.print(f"[red]Failed to pull {url_part}: {exc}[/red]")
# Initial link collection.
# The landing page visually shows only a small menu,
# but its HTML contains links to every section of the draft.
convert_html_to_markdown(get_page_and_add_urls(""), "cppdraft")
try:
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        TransferSpeedColumn(),
        console=console,
    ) as progress:
        # Queue is fully populated at this point, so qsize() is the final
        # page count for the progress bar.
        task_id = progress.add_task(
            "[cyan]Получение страниц...", total=url_queue.qsize()
        )
        # Drain the queue into the pool. get_page_and_add_urls has already
        # returned, so no producer is adding items concurrently and the
        # empty()/get() pair cannot race here.
        with ThreadPoolExecutor(max_workers=6, thread_name_prefix="PullDraft") as executor:
            futures = []
            while not url_queue.empty():
                url = url_queue.get()
                future = executor.submit(
                    pull_and_save,
                    url
                )
                futures.append(future)
            for future in as_completed(futures):
                try:
                    future.result()
                    # NOTE(review): the bar advances only on success, so a
                    # failed page leaves the total short of 100% — confirm
                    # that is intended.
                    progress.advance(task_id)
                except Exception as e:
                    console.print(e)
except Exception as e:
    console.print(e)