This commit is contained in:
svitvojimilioni@dmz.rs 2023-09-06 22:22:50 -04:00
commit 626946f9b0
21 changed files with 817 additions and 0 deletions

451
.gitignore vendored Normal file
View File

@ -0,0 +1,451 @@
### Flask ###
instance/*
!instance/.gitignore
.webassets-cache
.env
### Flask.Python Stack ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
### Intellij ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf
# AWS User-specific
.idea/**/aws.xml
# Generated files
.idea/**/contentModel.xml
# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml
# Gradle
.idea/**/gradle.xml
.idea/**/libraries
# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr
# CMake
cmake-build-*/
# Mongo Explorer plugin
.idea/**/mongoSettings.xml
# File-based project format
*.iws
# IntelliJ
out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Cursive Clojure plugin
.idea/replstate.xml
# SonarLint plugin
.idea/sonarlint/
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
# Editor-based Rest Client
.idea/httpRequests
# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
### Intellij Patch ###
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
# *.iml
# modules.xml
# .idea/misc.xml
# *.ipr
# Sonarlint plugin
# https://plugins.jetbrains.com/plugin/7973-sonarlint
.idea/**/sonarlint/
# SonarQube Plugin
# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
.idea/**/sonarIssues.xml
# Markdown Navigator plugin
# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
.idea/**/markdown-navigator.xml
.idea/**/markdown-navigator-enh.xml
.idea/**/markdown-navigator/
# Cache file creation bug
# See https://youtrack.jetbrains.com/issue/JBR-2257
.idea/$CACHE_FILE$
# CodeStream plugin
# https://plugins.jetbrains.com/plugin/12206-codestream
.idea/codestream.xml
# Azure Toolkit for IntelliJ plugin
# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij
.idea/**/azureSettings.xml
### PyCharm+all ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
# User-specific stuff
# AWS User-specific
# Generated files
# Sensitive or high-churn files
# Gradle
# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr
# CMake
# Mongo Explorer plugin
# File-based project format
# IntelliJ
# mpeltonen/sbt-idea plugin
# JIRA plugin
# Cursive Clojure plugin
# SonarLint plugin
# Crashlytics plugin (for Android Studio and IntelliJ)
# Editor-based Rest Client
# Android studio 3.1+ serialized cache file
### PyCharm+all Patch ###
# Ignore everything but code style settings and run configurations
# that are supposed to be shared within teams.
.idea/*
!.idea/codeStyles
!.idea/runConfigurations
### Python ###
# Byte-compiled / optimized / DLL files
# C extensions
# Distribution / packaging
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
# Installer logs
# Unit test / coverage reports
# Translations
# Django stuff:
# Flask stuff:
# Scrapy stuff:
# Sphinx documentation
# PyBuilder
# Jupyter Notebook
# IPython
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
# Celery stuff
# SageMath parsed files
# Environments
# Spyder project settings
# Rope project settings
# mkdocs documentation
# mypy
# Pyre type checker
# pytype static type analyzer
# Cython debug symbols
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml
# ruff
.ruff_cache/
# LSP config files
pyrightconfig.json
### Vim ###
# Swap
[._]*.s[a-v][a-z]
# comment out if you don't need vector files
!*.svg
[._]*.sw[a-p]
[._]s[a-rt-v][a-z]
[._]ss[a-gi-z]
[._]sw[a-p]
# Session
Session.vim
Sessionx.vim
# Temporary
.netrwhist
*~
# Auto-generated tag files
tags
# Persistent undo
[._]*.un~

16
LICENSE Normal file
View File

@ -0,0 +1,16 @@
dxdy program za analizu vesti i medija
Copyright (C) 2023 svitvojimilioni
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.

15
app/__init__.py Normal file
View File

@ -0,0 +1,15 @@
from flask import Flask
from flask_sqlalchemy import SQLAlchemy
import os
app = Flask(__name__)

# Dotted path of the configuration class to load. It can be overridden through
# the FLASK_CONFIG environment variable (e.g. "config.ProductionConfig").
# The default uses the "DevelopmentConifg" spelling because that is the name
# the development class has historically had in config.py.
config_string = os.environ.get("FLASK_CONFIG", "config.DevelopmentConifg")
app.config.from_object(config_string)

# Fall back to the local SQLite database only when the selected configuration
# object did not provide these settings itself.
app.config.setdefault("SQLALCHEMY_DATABASE_URI", "sqlite:///app.db")
app.config.setdefault("SQLALCHEMY_TRACK_MODIFICATIONS", False)

db = SQLAlchemy(app)

# Imported last on purpose: the routes module imports `app` and `db` from this
# package, so both must exist before it is loaded.
from . import routes  # noqa: E402,F401

9
app/forms/site.py Normal file
View File

@ -0,0 +1,9 @@
from flask_wtf import FlaskForm
from wtforms import StringField, SubmitField
from wtforms.validators import DataRequired
class SiteForm(FlaskForm):
    """Form collecting a site's name, link and RSS feed URL.

    All three text fields are required; the form also carries its own
    submit button.
    """

    name = StringField(label="Naziv", validators=[DataRequired()])
    base_url = StringField(label="Link", validators=[DataRequired()])
    feed_url = StringField(label="RSS", validators=[DataRequired()])
    submit_url = SubmitField(label="Dodaj sajt")

11
app/models/article.py Normal file
View File

@ -0,0 +1,11 @@
from app import db
class Article(db.Model):
    """A single article, stored in raw, debloated and summarized form."""

    id = db.Column(db.Integer, primary_key=True)
    # The URL is unique, so the same article cannot be stored twice.
    url = db.Column(db.String(2000), unique=True)
    title = db.Column(db.String(2000))
    # Three text representations of the same article.
    raw_content = db.Column(db.Text)
    debloated_content = db.Column(db.Text)
    summarized_content = db.Column(db.Text)
    # Foreign key to the `site` table row this article belongs to.
    site_id = db.Column(db.Integer, db.ForeignKey("site.id"))

5
app/models/category.py Normal file
View File

@ -0,0 +1,5 @@
# `db` lives in the top-level `app` package, not in `app.models`; import it the
# same way the other model modules do.
from app import db


class Category(db.Model):
    """A named category; names are unique."""

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(255), unique=True)

11
app/models/site.py Normal file
View File

@ -0,0 +1,11 @@
from app import db
from app.models.article import Article
class Site(db.Model):
    """A site identified by its base URL and feed URL, owning many articles."""

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(255))
    # Both URLs are unique across all sites.
    base_url = db.Column(db.String(1000), unique=True)
    feed_url = db.Column(db.String(1000), unique=True)
    # One-to-many link to Article; each Article gets a `site` attribute back.
    articles = db.relationship("Article", backref="site")
    # TODO add creation date
    # TODO add last_updated date

11
app/requirements.txt Normal file
View File

@ -0,0 +1,11 @@
click==8.1.3
Flask==2.2.2
Flask-SQLAlchemy==3.0.2
greenlet==2.0.1
importlib-metadata==5.0.0
itsdangerous==2.1.2
Jinja2==3.1.2
MarkupSafe==2.1.1
SQLAlchemy==1.4.44
Werkzeug==2.2.2
zipp==3.10.0

33
app/routes.py Normal file
View File

@ -0,0 +1,33 @@
from flask import render_template, redirect
from . import app, db
from .models.site import Site
from .forms.site import SiteForm
def get_basic_data():
    """Build the base template context holding the configured application name."""
    return {"APP_NAME": app.config["APP_NAME"]}
@app.route("/", methods=["GET"])
def index():
    """Render the index page with every stored site."""
    context = get_basic_data()
    context["sites"] = Site.query.all()
    return render_template("pages/index.html", data=context)
@app.route("/add", methods=["GET", "POST"])
def submit_site():
    """Show the "add site" form and store the site when it validates.

    On a valid POST a new Site row is committed and the client is redirected
    to the index page. Otherwise the form is rendered; after a failed POST the
    same form instance is reused so its validation errors reach the template.
    """
    data = get_basic_data()
    form = SiteForm()
    if form.validate_on_submit():
        site = Site(
            name=form.name.data,
            base_url=form.base_url.data,
            feed_url=form.feed_url.data,
        )
        db.session.add(site)
        db.session.commit()
        return redirect("/")

    # Reuse the validated form instead of building a fresh one: a new
    # SiteForm() would carry no error messages for the user.
    data["form"] = form
    return render_template("pages/submit_site.html", data=data, form=form)

3
app/tasks.py Normal file
View File

@ -0,0 +1,3 @@
def example_task(n: int) -> int:
    """Return ``n`` raised to the power of itself."""
    return pow(n, n)

View File

@ -0,0 +1,3 @@
<footer>
<hr>
</footer>

View File

@ -0,0 +1,4 @@
<header>
<h1>App</h1>
<hr>
</header>

View File

@ -0,0 +1,13 @@
<!doctype html>
<html>
<head>
<title>App</title>
</head>
<body>
<div class="container">
{% include "includes/header.html" %}
{% block content %}{% endblock content %}
{% include "includes/footer.html" %}
</div>
</body>
</html>

View File

@ -0,0 +1,11 @@
{% macro render_text_field(field) %}
<div class="form__group">
{{ field.label(class_="form__label") }}
{{ field(class_="form__field")}}
{%- for error in field.errors %}
<span class="form__error">{{ error }}</span>
{% endfor %}
</div>
{% endmacro %}

View File

@ -0,0 +1,22 @@
{% extends "layouts/base.html" %}
{% block content %}
{% for site in data["sites"] %}
<details>
<summary>
{{site.name}}
</summary>
<ul>
{% for article in site.articles %}
<li>
<details>
<summary><b>{{article.title}}</b></summary>
<p>{{article.summarized_content}}</p>
</details>
</li>
{% endfor %}
</ul>
</details>
{% endfor %}
{% endblock %}

View File

@ -0,0 +1,15 @@
{% extends "layouts/base.html" %}
{% from "macros/render_field.html" import render_text_field %}
{% block content %}
<form method="POST">
{{ data["form"].hidden_tag() }}
{{render_text_field(data["form"].name)}}
{{render_text_field(data["form"].base_url)}}
{{render_text_field(data["form"].feed_url)}}
<div>
{{data["form"].submit_url}}
</div>
</form>
{% endblock content %}

4
bootstrap_db.py Normal file
View File

@ -0,0 +1,4 @@
from app import app, db
# Create the database tables for the models known to `db`. The call needs an
# active application context, so one is opened just for this statement.
context = app.app_context()
with context:
    db.create_all()

29
config.py Normal file
View File

@ -0,0 +1,29 @@
from pathlib import Path
import os
class Config:
    """Base settings shared by every environment."""

    APP_NAME = "dxdy"
    DEBUG = False
    TESTING = False
    # Taken from the SECRET_KEY environment variable when it is set, so a real
    # secret never has to be committed; the placeholder remains the fallback.
    SECRET_KEY = os.environ.get("SECRET_KEY", "Change this to something secure")
    # Directory of the `app` package, located next to this file.
    APP_ROOT = os.path.join(Path(__file__).parent, "app")
    DB_NAME = "app.db"
    DB_USERNAME = ""
    DB_PASSWORD = ""
    SQLALCHEMY_DATABASE_URI = f"sqlite:///{DB_NAME}"
    SQLALCHEMY_TRACK_MODIFICATIONS = False
    HOST = "127.0.0.1"
    UPLOAD_FOLDER = f"{APP_ROOT}/static"


class DevelopmentConfig(Config):
    """Development settings: base configuration with debug mode enabled."""

    DEBUG = True


# Backward-compatible alias: the class used to be published under this
# misspelled name, and existing "config.DevelopmentConifg" references must
# keep resolving.
DevelopmentConifg = DevelopmentConfig


class ProductionConfig(Config):
    """Production settings: base configuration bound to all interfaces."""

    HOST = "0.0.0.0"


class TestingConfig(Config):
    """Testing settings: base configuration with the testing flag enabled."""

    TESTING = True

97
full_analysis.py Normal file
View File

@ -0,0 +1,97 @@
from app import db, app
from app.models.site import Site
from app.models.article import Article
import feedparser
import trafilatura
import httpx
from pprint import pprint
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
def get_all_sites() -> list[Site]:
    """Return every Site row, queried inside a temporary application context."""
    with app.app_context():
        return Site.query.all()
def print_sites(sites: list[Site]) -> None:
    """Print each site's name, base URL and feed URL, followed by a separator line."""
    separator = "*" * 10
    for entry in sites:
        print(entry.name, entry.base_url, entry.feed_url, separator, sep="\n")
def analyze_site(site: Site) -> list[dict]:
    """Parse the site's feed and return the title/link pairs of its entries."""
    parsed_feed = feedparser.parse(site.feed_url)
    return get_article_links_from_feed(parsed_feed)
def get_article_links_from_feed(feed: feedparser.util.FeedParserDict) -> list[dict]:
    """Extract the title and link of every usable entry in a parsed feed.

    Args:
        feed: Mapping with an "entries" list, as produced by feedparser.parse.

    Returns:
        A list of ``{"title": ..., "link": ...}`` dicts in feed order.
        Entries without a link are skipped because there is nothing to fetch
        for them; entries without a title use their link as the title.
    """
    links: list[dict] = []
    for entry in feed.get("entries", []):
        link = entry.get("link")
        if not link:
            # Feeds are not required to provide a link for every item.
            continue
        links.append({"title": entry.get("title") or link, "link": link})
    return links
def analyze_sites(sites: list[Site]):
    """Analyze every site, import its articles and collect the per-site link lists."""
    collected = []
    for current in sites:
        found = analyze_site(current)
        import_articles(found, current)
        collected.append(found)
    return collected
def import_articles(articles: list[dict], site: Site) -> bool:
    """Download, parse and store each article of a site.

    Args:
        articles: Dicts with "title" and "link" keys, one per article.
        site: The Site the stored articles are attached to.

    Returns:
        True when every article was stored, False if at least one failed.
        A failure never aborts the run: it is reported and the next article
        is processed.
    """
    success = True
    for entry in articles:
        label = entry.get("title", "<untitled>")
        with app.app_context():
            try:
                parsed = parse_article(entry, site)
                record = Article(
                    title=parsed["title"],
                    url=parsed["url"],
                    raw_content=parsed["raw_content"],
                    summarized_content=parsed["summarized_content"],
                    debloated_content=parsed["debloated_content"],
                    site=site,
                )
                db.session.add(record)
                db.session.commit()
            except Exception as exc:
                # Best-effort batch import: any fetch, parse or database error
                # (e.g. an already stored URL) only affects this article.
                # Roll back so the session is usable again.
                db.session.rollback()
                success = False
                print(f"[-] Failed to write {label}: {exc}")
            else:
                print(f"[+] Written {label}")
    return success
def summarize_article(text, language="english", sentences_count=5):
    """Summarize plain text with the LSA summarizer.

    Args:
        text: Plain text to summarize.
        language: Language name handed to the tokenizer.
        sentences_count: Number of sentences requested from the summarizer.

    Returns:
        The selected sentences joined by single spaces.
    """
    document = PlaintextParser.from_string(text, Tokenizer(language)).document
    chosen = LsaSummarizer()(document, sentences_count)
    return " ".join(map(str, chosen))
def parse_article(article: dict, site: Site) -> dict:
    """Download an article and derive its debloated and summarized text.

    Args:
        article: Dict with the article's "title" and "link".
        site: The Site the article belongs to (currently unused here).

    Returns:
        A dict with the keys "title", "url", "raw_content",
        "debloated_content" and "summarized_content". When no main text can
        be extracted, "debloated_content" is None and the summary is empty.
    """
    # Feed links frequently redirect to the canonical article URL, and httpx
    # does not follow redirects unless asked to.
    resp = httpx.get(article["link"], follow_redirects=True)
    article_raw = resp.text
    article_debloated = trafilatura.extract(article_raw)
    # extract() yields None when it finds no usable content; the summarizer
    # cannot handle that, so fall back to an empty summary.
    article_summary = summarize_article(article_debloated) if article_debloated else ""
    return {
        "title": article["title"],
        "url": article["link"],
        "raw_content": article_raw,
        "debloated_content": article_debloated,
        "summarized_content": article_summary,
    }
def main():
    """Load all sites, analyze them and pretty-print the collected links."""
    pprint(analyze_sites(get_all_sites()))


# Run the full analysis only when this file is executed as a script.
if __name__ == "__main__":
    main()

50
requirements.txt Normal file
View File

@ -0,0 +1,50 @@
anyio==4.0.0
bcrypt==4.0.1
blinker==1.6.2
breadability==0.1.20
certifi==2023.7.22
chardet==5.2.0
charset-normalizer==3.2.0
click==8.1.7
courlan==0.9.4
dateparser==1.1.8
docopt==0.6.2
feedparser==6.0.10
Flask==2.3.3
Flask-Bcrypt==1.0.1
Flask-SQLAlchemy==3.0.5
Flask-WTF==1.1.1
greenlet==2.0.2
h11==0.14.0
htmldate==1.5.1
httpcore==0.17.3
httpx==0.24.1
idna==3.4
itsdangerous==2.1.2
Jinja2==3.1.2
joblib==1.3.2
jusText==3.0.0
langcodes==3.3.0
lxml==4.9.3
MarkupSafe==2.1.3
nltk==3.8.1
numpy==1.25.2
pycountry==22.3.5
python-dateutil==2.8.2
pytz==2023.3.post1
readability==0.3.1
regex==2023.8.8
requests==2.31.0
sgmllib3k==1.0.0
six==1.16.0
sniffio==1.3.0
SQLAlchemy==2.0.20
sumy==0.11.0
tld==0.13
tqdm==4.66.1
trafilatura==1.6.2
typing_extensions==4.7.1
tzlocal==5.0.1
urllib3==2.0.4
Werkzeug==2.3.7
WTForms==3.0.1

4
run.py Normal file
View File

@ -0,0 +1,4 @@
from app import app
# Start the Flask server only when this file is run directly as a script.
if __name__ == "__main__":
    app.run()