diff --git a/functions.py b/functions.py
index 57fd81a..556c6a5 100644
--- a/functions.py
+++ b/functions.py
@@ -5,7 +5,7 @@ def processmsg(msg, rcpt):
     if "youtube.com/watch" in msg:
         return msg.replace("youtube.com", "iv.datura.network")
     elif msg.startswith("!"):
-        return command(msg)
+        return command(msg)
     elif "good bot" in msg:
         return "^_^"
 
@@ -21,5 +21,6 @@
         response = client.chat(model='llama2-uncensored:latest', messages=[{'role':'user','content':f'{msg[4:]}'}])
         return(response['message']['content'])
     elif msg.startswith("!wiki"):
-        return sf.query_external_website("https://en.wikipedia.org/wiki/", msg.split(" ")[1])
+        _, query = msg.split(" ", 1)
+        return sf.query_external_website("https://en.wikipedia.org", "/wiki/" + query)
 
diff --git a/scraper_functions.py b/scraper_functions.py
index 5b9cd9f..502f44f 100644
--- a/scraper_functions.py
+++ b/scraper_functions.py
@@ -1,12 +1,19 @@
 import requests
 from bs4 import BeautifulSoup
+from urllib.parse import quote
 
 def query_external_website(base_url, query):
     try:
-        page = requests.get(base_url + query)
+        page = requests.get(base_url + quote(query))
         soup = BeautifulSoup(page.content, "html.parser")
-        title = soup.find("span", class_="mw-page-title-main").text
-        content = soup.find(id="mw-content-text").select("p")[2].text
-        return "\nTITLE:\n" + title + "\n\nCONTENT:\n" + content + "\n\nFULL LINK:\n" + base_url + query
-    except:
-        return "Can't parse search result :("
+        title = soup.find(id="firstHeading").text
+        main_content = soup.find(id="mw-content-text")
+        if "This page is a redirect" in main_content.text:
+            redirect_link = main_content.find(class_="redirectMsg").find_all("a")[0]["href"]
+            return query_external_website(base_url, redirect_link)
+        content = next((p for p in main_content.select("p") if not p.has_attr("class")), None)
+        if content is None:
+            raise Exception("Can't parse search result :(")
+        return "\nTITLE:\n" + title + "\n\nCONTENT:\n" + content.text + "\n\nFULL LINK:\n" + base_url + quote(query)
+    except Exception as e:
+        return str(e)