# chatbot/scraper_functions.py

import requests
from bs4 import BeautifulSoup
from urllib.parse import quote


def getSoup(base_url, query=""):
    """Fetch base_url + URL-encoded `query` and return the parsed HTML tree."""
    page = requests.get(base_url + quote(query))
    soup = BeautifulSoup(page.content, "html.parser")
    return soup

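# Example (the URL is an assumption for illustration, not from this repo):
#   getSoup("https://en.wikipedia.org/wiki/", "Web scraping")
# fetches and parses https://en.wikipedia.org/wiki/Web%20scraping.
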

def query_external_website(base_url, query):
    """Look up `query` on a MediaWiki-style site and return a text summary."""
    try:
        soup = getSoup(base_url, query)
        title = soup.find(id="firstHeading").text
        mainContentElement = soup.find(id="mw-content-text")
        # Redirect pages link to the real article; follow that link once.
        if "This page is a redirect" in mainContentElement.text:
            redirectLink = mainContentElement.find(class_="redirectMsg").find_all("a")[0]["href"]
            return query_external_website(base_url, redirectLink)
        # The first <p> without a class is normally the article's lead paragraph.
        content = next((paragraph for paragraph in mainContentElement.select("p") if not paragraph.has_attr("class")), None)
        if content is None:
            raise Exception("Can't parse search result :(")
        return "\nTITLE:\n" + title + "\n\nCONTENT:\n" + content.text + "\n\nFULL LINK:\n" + base_url + quote(query)
    except Exception as e:
        # Surface the error message as the reply text rather than raising.
        return str(e)

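# Example (the URL is an assumption for illustration):
#   query_external_website("https://en.wikipedia.org/wiki/", "Python (programming language)")
# returns the article title, its lead paragraph, and the full link as one string.
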

def getDmzTasks(url):
    """Scrape the task board at `url` and return a formatted list of active tasks."""
    try:
        soup = getSoup(url)
        tasks = soup.find_all(class_="task")
        result = "\nActive tasks:\n"
        for task in tasks:
            taskIndex = task.select("div")[0].text
            taskTitle = task.select("div")[1].text
            result += taskIndex + " " + taskTitle
            # The task link's href starts with "/", so strip it before joining
            # with the base url (which is expected to end in "/").
            taskSoup = getSoup(url + task.find("a")["href"][1:])
            description = taskSoup.find("main").select("section")[0].find("p").text
            result += "\n\tDescription:\n" + "\t\t" + description + "\n"
            result += "\tAssigned users:\n" + "\t\t"
            assignedUsers = taskSoup.find_all(class_="user-info-wrap")
            if len(assignedUsers) == 0:
                # Unassigned tasks also get a direct link so someone can claim them.
                result += "None! Be the first :)\n"
                result += "\tLink: " + url + task.find("a")["href"][1:] + "\n\n"
                continue
            usersList = ""
            for user in assignedUsers:
                usersList += user.find("div").text.split(": ")[1] + ", "
            result += usersList[:-2] + "\n\n"  # drop the trailing ", "
        return result
    except Exception as e:
        return str(e)
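

# Minimal manual test (a sketch: both URLs are assumptions for illustration,
# not taken from this repo's configuration).
if __name__ == "__main__":
    print(query_external_website("https://en.wikipedia.org/wiki/", "Web scraping"))
    print(getDmzTasks("https://tasks.example.org/"))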