From e89201cb4bbef69aa1927ef69327fe6fa3444cba Mon Sep 17 00:00:00 2001 From: texhno Date: Tue, 6 Feb 2024 02:04:47 +0100 Subject: [PATCH 1/2] Added scraper functions (wikipedia, for now) --- functions.py | 8 +++++--- requirements.txt | 2 ++ scraper_functions.py | 9 +++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) create mode 100644 scraper_functions.py diff --git a/functions.py b/functions.py index 6a2fc5c..6778156 100644 --- a/functions.py +++ b/functions.py @@ -1,8 +1,10 @@ +import scraper_functions as sf + def processmsg(msg, rcpt): - if msg.startswith("!"): - return command(msg, rcpt) - elif "youtube.com/watch" in msg: + if "youtube.com/watch" in msg: return msg.replace("youtube.com", "iv.datura.network") + elif msg.startswith("!wiki"): + return sf.query_external_website("https://en.wikipedia.org/wiki/", msg.split(" ")[1]) def command(msg, rcpt): if msg.startswith("!help"): diff --git a/requirements.txt b/requirements.txt index 5593b66..f8acc82 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,3 @@ slixmpp +requests +beautifulsoup4 diff --git a/scraper_functions.py b/scraper_functions.py new file mode 100644 index 0000000..a6f3891 --- /dev/null +++ b/scraper_functions.py @@ -0,0 +1,9 @@ +import requests +from bs4 import BeautifulSoup + +def query_external_website(base_url, query): + page = requests.get(base_url + query) + soup = BeautifulSoup(page.content, "html.parser") + title = soup.select(".mw-page-title-main")[0] + content = soup.find(id="bodyContent").select("p")[2].text + return "\nTITLE: " + title.text + "\n\n" + "CONTENT:" + "\n" + content + "\n\n" + "FULL LINK:\n" + base_url + query From 834934fccd09dc27d5dfbe5aaa8ed3b029755fb2 Mon Sep 17 00:00:00 2001 From: texhno Date: Tue, 6 Feb 2024 02:21:53 +0100 Subject: [PATCH 2/2] Added scraper function --- functions.py | 1 + requirements.txt | 2 ++ scraper_functions.py | 9 +++++++++ 3 files changed, 12 insertions(+) create mode 100644 scraper_functions.py diff --git a/functions.py
b/functions.py index d722e49..4452e9b 100644 --- a/functions.py +++ b/functions.py @@ -1,4 +1,5 @@ import ollama +import scraper_functions as sf def processmsg(msg, rcpt): if "youtube.com/watch" in msg: diff --git a/requirements.txt b/requirements.txt index e1e7e30..b685df2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,4 @@ slixmpp ollama +requests +beautifulsoup4 diff --git a/scraper_functions.py b/scraper_functions.py new file mode 100644 index 0000000..2f0445a --- /dev/null +++ b/scraper_functions.py @@ -0,0 +1,9 @@ +import requests +from bs4 import BeautifulSoup + +def query_external_website(base_url, query): + page = requests.get(base_url + query) + soup = BeautifulSoup(page.content, "html.parser") + title = soup.find("span", class_="mw-page-title-main").text + content = soup.find(id="mw-content-text").select("p")[2].text + return "\nTITLE:\n" + title + "\n\nCONTENT:\n" + content + "\n\nFULL LINK:\n" + base_url + query