From e89201cb4bbef69aa1927ef69327fe6fa3444cba Mon Sep 17 00:00:00 2001
From: texhno
Date: Tue, 6 Feb 2024 02:04:47 +0100
Subject: [PATCH] Added scraper functions (wikipedia, for now)

---
 functions.py         | 8 +++++---
 requirements.txt     | 2 ++
 scraper_functions.py | 9 +++++++++
 3 files changed, 16 insertions(+), 3 deletions(-)
 create mode 100644 scraper_functions.py

diff --git a/functions.py b/functions.py
index 6a2fc5c..6778156 100644
--- a/functions.py
+++ b/functions.py
@@ -1,8 +1,10 @@
+import scraper_functions as sf
+
 def processmsg(msg, rcpt):
-    if msg.startswith("!"):
-        return command(msg, rcpt)
-    elif "youtube.com/watch" in msg:
+    if "youtube.com/watch" in msg:
         return msg.replace("youtube.com", "iv.datura.network")
+    elif msg.startswith("!wiki"):
+        return sf.query_external_website("https://en.wikipedia.org/wiki/", msg.split(" ")[1])
 
 def command(msg, rcpt):
     if msg.startswith("!help"):
diff --git a/requirements.txt b/requirements.txt
index 5593b66..f8acc82 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,3 @@
 slixmpp
+requests
+beautifulsoup4
diff --git a/scraper_functions.py b/scraper_functions.py
new file mode 100644
index 0000000..a6f3891
--- /dev/null
+++ b/scraper_functions.py
@@ -0,0 +1,9 @@
+import requests
+from bs4 import BeautifulSoup
+
+def query_external_website(base_url, query):
+    page = requests.get(base_url + query)
+    soup = BeautifulSoup(page.content, "html.parser")
+    title = soup.select(".mw-page-title-main")[0]
+    content = soup.find(id="bodyContent").select("p")[2].text
+    return "\nTITLE: " + title.text + "\n\n" + "CONTENT:" + "\n" + content + "\n\n" + "FULL LINK:\n" + base_url + query