"""Fetch RSS/Atom feeds for stored sites, extract each article's main text,
summarize it with LSA, and persist the results to the database."""

from app import db, app
from app.models.site import Site
from app.models.article import Article
import feedparser
import trafilatura
import httpx
from pprint import pprint
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer


def get_all_sites() -> list[Site]:
    """Return every Site row stored in the database."""
    with app.app_context():
        return Site.query.all()


def print_sites(sites: list[Site]) -> None:
    """Print name, base URL and feed URL for each site, separated by a rule."""
    for site in sites:
        print(site.name)
        print(site.base_url)
        print(site.feed_url)
        print("*" * 10)


def analyze_site(site: Site) -> list[dict]:
    """Parse a site's feed and return its article title/link pairs."""
    feed = feedparser.parse(site.feed_url)
    return get_article_links_from_feed(feed)


def get_article_links_from_feed(feed: feedparser.util.FeedParserDict) -> list[dict]:
    """Extract ``{"title", "link"}`` dicts from a parsed feed's entries."""
    return [
        {"title": entry["title"], "link": entry["link"]}
        for entry in feed["entries"]
    ]


def analyze_sites(sites: list[Site]) -> list[list[dict]]:
    """Analyze and import articles for every site; return per-site link lists."""
    results = []
    for site in sites:
        articles = analyze_site(site)
        import_articles(articles, site)
        results.append(articles)
    return results


def import_articles(articles: list[dict], site: Site) -> bool:
    """Download, summarize and persist each article for *site*.

    Returns:
        True only if every article was committed successfully; False if
        any insert failed (remaining articles are still attempted).
    """
    success = True
    for entry in articles:
        parsed = parse_article(entry, site)
        # Do not reuse the loop variable name for the ORM object — the
        # original shadowed `article` (feed dict) with the model instance.
        new_article = Article(
            title=parsed["title"],
            url=parsed["url"],
            raw_content=parsed["raw_content"],
            summarized_content=parsed["summarized_content"],
            debloated_content=parsed["debloated_content"],
            site=site,
        )
        with app.app_context():
            try:
                db.session.add(new_article)
                db.session.commit()
                print(f"[+] Written {new_article.title}")
            except Exception:
                # Bug fix: the original never set success = False, so the
                # function always reported True. Also roll back so the
                # session remains usable for subsequent articles.
                db.session.rollback()
                success = False
                print(f"[-] Failed to write {new_article.title}")
    return success


def summarize_article(text: str, language: str = "english", sentences_count: int = 5) -> str:
    """Return an extractive LSA summary of *text*.

    Args:
        text: Plain text to summarize.
        language: Tokenizer language passed to sumy.
        sentences_count: Maximum number of sentences in the summary.
    """
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, sentences_count)
    return ' '.join(str(sentence) for sentence in summary)


def parse_article(article: dict, site: Site) -> dict:
    """Download one article, strip boilerplate, and summarize it.

    Args:
        article: Dict with ``"title"`` and ``"link"`` keys from the feed.
        site: Owning Site (currently unused here; kept for interface
            compatibility with callers).
    """
    resp = httpx.get(article["link"])
    article_raw = resp.text
    article_debloated = trafilatura.extract(article_raw)
    # Bug fix: trafilatura.extract returns None when it finds no main
    # content; summarizing None would crash the tokenizer. Fall back to
    # empty strings so the article is still recorded.
    if article_debloated is None:
        article_debloated = ""
        article_summary = ""
    else:
        article_summary = summarize_article(article_debloated)
    return {
        "title": article["title"],
        "url": article["link"],
        "raw_content": article_raw,
        "debloated_content": article_debloated,
        "summarized_content": article_summary,
    }


def main() -> None:
    """Entry point: load all sites, import their articles, print the results."""
    sites = get_all_sites()
    results = analyze_sites(sites)
    pprint(results)


if __name__ == "__main__":
    main()