# dxdy/full_analysis.py
from app import db, app
from app.models.site import Site
from app.models.article import Article
import feedparser
import trafilatura
import httpx
from pprint import pprint
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
def get_all_sites() -> list[Site]:
    """Return every Site record from the database."""
    with app.app_context():
        return Site.query.all()
def print_sites(sites:list[Site]) -> None:
for site in sites:
print(site.name)
print(site.base_url)
print(site.feed_url)
print("*" * 10)
def analyze_site(site: Site) -> list[dict]:
    """Parse a site's RSS/Atom feed and return its article title/link pairs."""
    parsed_feed = feedparser.parse(site.feed_url)
    return get_article_links_from_feed(parsed_feed)
def get_article_links_from_feed(feed: "feedparser.util.FeedParserDict") -> list[dict]:
    """Extract title/link pairs from a parsed feed.

    Args:
        feed: A parsed feed — any mapping with an ``"entries"`` list whose
            items carry ``"title"`` and ``"link"`` keys works.

    Returns:
        A list of ``{"title": ..., "link": ...}`` dicts, in feed order.
    """
    # Comprehension replaces the manual append loop (ruff PERF401); the
    # annotation is quoted so defining the function does not require
    # feedparser to be importable at def time.
    return [
        {"title": entry["title"], "link": entry["link"]}
        for entry in feed["entries"]
    ]
def analyze_sites(sites: list[Site]):
    """Analyze each site's feed, import its articles, and collect the links.

    Returns one list of title/link dicts per site, in input order.
    """
    collected = []
    for current_site in sites:
        site_links = analyze_site(current_site)
        import_articles(site_links, current_site)
        collected.append(site_links)
    return collected
def import_articles(articles: list[dict], site: Site) -> bool:
    """Download, parse and persist a batch of feed articles for *site*.

    Args:
        articles: Feed entries as dicts with ``"title"`` and ``"link"`` keys.
        site: The Site the articles belong to.

    Returns:
        True if every article was written, False if any insert failed.
    """
    success = True
    for entry in articles:
        parsed = parse_article(entry, site)
        record = Article(
            title=parsed["title"],
            url=parsed["url"],
            raw_content=parsed["raw_content"],
            summarized_content=parsed["summarized_content"],
            debloated_content=parsed["debloated_content"],
            site=site,
        )
        with app.app_context():
            try:
                db.session.add(record)
                db.session.commit()
                print(f"[+] Written {record.title}")
            except Exception as exc:
                # BUG FIX: the original never flipped the flag, so this
                # function always returned True even after failures.
                success = False
                # Roll back the failed transaction so the session stays
                # usable for the remaining articles.
                db.session.rollback()
                # Surface the swallowed exception instead of discarding it.
                print(f"[-] Failed to write {record.title}: {exc}")
    return success
def summarize_article(text, language="english", sentences_count=5):
    """Condense *text* to its most salient sentences via LSA summarization.

    Args:
        text: Plain-text article body.
        language: Tokenizer language name (default ``"english"``).
        sentences_count: Number of sentences to keep (default 5).

    Returns:
        The selected sentences joined into one space-separated string.
    """
    document = PlaintextParser.from_string(text, Tokenizer(language)).document
    selected = LsaSummarizer()(document, sentences_count)
    return " ".join(str(sentence) for sentence in selected)
def parse_article(article: dict, site: Site) -> dict:
    """Fetch an article page and produce raw, debloated and summarized text.

    Args:
        article: Dict with ``"title"`` and ``"link"`` keys from the feed.
        site: The Site the article belongs to (currently unused here, kept
            for interface stability).

    Returns:
        Dict with ``title``, ``url``, ``raw_content``, ``debloated_content``
        and ``summarized_content`` keys, ready for the Article constructor.
    """
    response = httpx.get(article["link"])
    raw_html = response.text
    # BUG FIX: trafilatura.extract returns None when it cannot find a main
    # body; the original passed that None straight into summarize_article,
    # which crashed. Fall back to an empty string and skip summarization.
    debloated = trafilatura.extract(raw_html) or ""
    summary = summarize_article(debloated) if debloated else ""
    return {
        "title": article["title"],
        "url": article["link"],
        "raw_content": raw_html,
        "debloated_content": debloated,
        "summarized_content": summary,
    }
def main():
    """Entry point: analyze every known site and pretty-print the results."""
    pprint(analyze_sites(get_all_sites()))


if __name__ == "__main__":
    main()