98 lines
2.9 KiB
Python
98 lines
2.9 KiB
Python
|
from app import db, app
|
||
|
from app.models.site import Site
|
||
|
from app.models.article import Article
|
||
|
|
||
|
import feedparser
|
||
|
import trafilatura
|
||
|
import httpx
|
||
|
from pprint import pprint
|
||
|
|
||
|
from sumy.parsers.plaintext import PlaintextParser
|
||
|
from sumy.nlp.tokenizers import Tokenizer
|
||
|
from sumy.summarizers.lsa import LsaSummarizer
|
||
|
|
||
|
|
||
|
def get_all_sites() -> list[Site]:
    """Fetch every Site record from the database.

    Returns:
        All rows of the ``Site`` table.
    """
    # SQLAlchemy queries need an active Flask application context.
    with app.app_context():
        sites = Site.query.all()
    return sites
|
||
|
|
||
|
def print_sites(sites:list[Site]) -> None:
    """Dump name, base URL and feed URL of each site to stdout."""
    separator = "*" * 10
    for entry in sites:
        for value in (entry.name, entry.base_url, entry.feed_url):
            print(value)
        print(separator)
|
||
|
|
||
|
def analyze_site(site: Site) -> list[dict]:
    """Parse one site's RSS/Atom feed and collect its article links.

    Args:
        site: Site whose ``feed_url`` should be fetched and parsed.

    Returns:
        One dict per feed entry, as produced by
        ``get_article_links_from_feed``.
    """
    parsed_feed = feedparser.parse(site.feed_url)
    return get_article_links_from_feed(parsed_feed)
|
||
|
|
||
|
def get_article_links_from_feed(feed: feedparser.util.FeedParserDict) -> list[dict]:
    """Extract title/link pairs from a parsed feed.

    Args:
        feed: Result of ``feedparser.parse``.

    Returns:
        A list of ``{"title": ..., "link": ...}`` dicts, one per entry.
    """
    return [
        {"title": entry["title"], "link": entry["link"]}
        for entry in feed["entries"]
    ]
|
||
|
|
||
|
|
||
|
def analyze_sites(sites:list[Site]):
    """Crawl every site's feed and persist the discovered articles.

    Args:
        sites: Sites to process.

    Returns:
        A list with one entry per site, each being the list of link
        dicts returned by ``analyze_site``.
    """
    collected = []
    for current_site in sites:
        links = analyze_site(current_site)
        import_articles(links, current_site)
        collected.append(links)

    return collected
|
||
|
|
||
|
|
||
|
def import_articles(articles: list[dict], site:Site) -> bool:
    """Download, parse and persist each article of a site.

    Args:
        articles: Feed entries as ``{"title": ..., "link": ...}`` dicts.
        site: The Site the articles belong to.

    Returns:
        True if every article was written to the database, False if at
        least one insert failed.
    """
    success = True
    for entry in articles:
        parsed_article = parse_article(entry, site)
        # Avoid shadowing the loop variable with the ORM object.
        record = Article(
            title=parsed_article["title"],
            url=parsed_article["url"],
            raw_content=parsed_article["raw_content"],
            summarized_content=parsed_article["summarized_content"],
            debloated_content=parsed_article["debloated_content"],
            site=site,
        )
        with app.app_context():
            try:
                db.session.add(record)
                db.session.commit()
                print(f"[+] Written {record.title}")
            except Exception as e:
                # Roll back the failed transaction so the session stays
                # usable for the remaining articles.
                db.session.rollback()
                # Report the cause instead of silently swallowing it.
                print(f"[-] Failed to write {record.title}: {e}")
                # BUG FIX: the original never set this, so the function
                # returned True even when every commit failed.
                success = False
    return success
|
||
|
|
||
|
def summarize_article(text, language="english", sentences_count=5):
    """Produce an extractive summary of *text* using LSA.

    Args:
        text: Plain-text article body.
        language: Tokenizer language handed to sumy.
        sentences_count: Maximum number of sentences in the summary.

    Returns:
        The selected sentences joined into a single string.
    """
    document = PlaintextParser.from_string(text, Tokenizer(language)).document
    chosen = LsaSummarizer()(document, sentences_count)
    return ' '.join(str(sentence) for sentence in chosen)
|
||
|
|
||
|
def parse_article(article:dict, site:Site) -> dict:
    """Fetch an article page and derive raw, cleaned and summarized text.

    Args:
        article: Dict with at least ``title`` and ``link`` keys
            (as produced by ``get_article_links_from_feed``).
        site: The owning Site (currently unused here; kept for the
            established call signature).

    Returns:
        Dict with ``title``, ``url``, ``raw_content``,
        ``debloated_content`` and ``summarized_content`` keys.
    """
    resp = httpx.get(article["link"])
    article_raw = resp.text
    # BUG FIX: trafilatura.extract returns None when it cannot find main
    # content, which crashed summarize_article; fall back to "".
    article_debloated = trafilatura.extract(article_raw) or ""
    article_summary = summarize_article(article_debloated) if article_debloated else ""
    return {
        "title": article["title"],
        "url": article["link"],
        "raw_content": article_raw,
        "debloated_content": article_debloated,
        "summarized_content": article_summary,
    }
|
||
|
|
||
|
|
||
|
|
||
|
def main():
    """Entry point: crawl every configured site and print the results."""
    all_sites = get_all_sites()
    pprint(analyze_sites(all_sites))
|
||
|
|
||
|
|
||
|
# Run the crawl only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||
|
|