Fri Jan 12, 2024 5:20 am
Fri Jan 12, 2024 9:41 am
import requests
from bs4 import BeautifulSoup
def get_page_source(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so one
            stalled request cannot hang the whole crawl.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None``. A short failure message is written to stderr in that case.
    """
    import sys

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Network failures are reported like a bad status so the caller can
        # simply skip this page and continue.
        print(f"Failed to retrieve page. Error: {error}", file=sys.stderr)
        return None

    if response.status_code == 200:
        return response.text

    # Diagnostics go to stderr: stdout carries the generated HTML report.
    print(
        f"Failed to retrieve page. Status code: {response.status_code}",
        file=sys.stderr,
    )
    return None
def parse_table_rows(html_content):
    """Extract row data from the second ``tablebg`` table in *html_content*.

    Each returned row is the list of stripped cell texts (``td`` and ``th``)
    of one ``tr``. When the third cell contains an ``<a>`` element, that
    link's ``href`` value is appended as one extra trailing item.

    Args:
        html_content: HTML markup of a page, as a string.

    Returns:
        A list of rows (each a list of strings plus the optional href), or
        ``None`` when the page has fewer than two tables with class
        ``tablebg``. A message is written to stderr in that case.
    """
    import sys

    soup = BeautifulSoup(html_content, 'html.parser')
    tables = soup.find_all('table', {'class': 'tablebg'})
    if len(tables) < 2:
        # Diagnostics go to stderr: stdout carries the generated HTML report.
        print("Second table not found.", file=sys.stderr)
        return None

    data_array = []
    # The first row is skipped as the header row.
    # NOTE(review): this slice keeps at most 24 rows, while the caller in this
    # file pages through results 25 at a time -- confirm the last topic of
    # each page is not being dropped.
    for row in tables[1].find_all('tr')[1:25]:
        columns = row.find_all(['td', 'th'])
        row_data = [column.get_text(strip=True) for column in columns]
        if len(columns) >= 3:
            link = columns[2].find('a')
            if link:
                row_data.append(link.get('href'))
        data_array.append(row_data)
    return data_array
#print (html_content)
# table = soup.find('table', {'class': 'tablebg'})
# if table:
# rows = table.find_all('tr')
# for row in rows:
# columns = row.find_all(['td', 'th'])
# for column in columns:
# print(column.get_text(strip=True), end='\t')
# print()
def collect_topic_rows(page_count=745, page_size=25):
    """Fetch the active-topics result pages and return all parsed rows.

    Args:
        page_count: Number of result pages to request.
        page_size: Step applied to the ``start`` query parameter per page.

    Returns:
        A flat list with the rows of every page that could be downloaded and
        parsed. Pages that fail either step are skipped.
    """
    rows = []
    for page in range(page_count):
        url = (
            "http://gimpchat.com/search.php?st=0&sk=t&sd=d&sr=topics"
            "&search_id=active_topics&start=" + str(page * page_size)
        )
        page_source = get_page_source(url)
        if not page_source:
            continue
        page_rows = parse_table_rows(page_source)
        # parse_table_rows returns None when the results table is missing;
        # concatenating that onto a list would raise TypeError.
        if page_rows:
            rows.extend(page_rows)
    return rows


def view_count(row):
    """Return the view count in column 5 of *row* as an int, else ``None``."""
    try:
        return int(row[5])
    except (IndexError, TypeError, ValueError):
        return None


def render_topics_table(rows):
    """Render *rows* as the lines of an HTML table sorted by views, descending.

    Column 2 is used as the topic title, column 3 as the author, column 5 as
    the view count and column 7 as the topic link. Rows that lack column 7 or
    whose view count is not an integer are left out instead of aborting the
    report.

    Args:
        rows: Iterable of row lists as produced by ``parse_table_rows``.

    Returns:
        A list of ASCII-only strings: the header line, one line per usable
        row, and the closing ``</table>`` tag.
    """
    from html import escape

    def cell(value):
        # Escape markup characters and both quote styles, then turn non-ASCII
        # characters into numeric references so printing can never raise
        # UnicodeEncodeError and no row has to be dropped for that reason.
        escaped = escape(str(value), quote=True)
        return escaped.encode("ascii", "xmlcharrefreplace").decode("ascii")

    usable = [
        row for row in rows
        if len(row) > 7 and view_count(row) is not None
    ]
    # reverse=True keeps the sort stable, so ties stay in crawl order.
    usable.sort(key=view_count, reverse=True)

    lines = [
        "<table><tr><td>No.</td><td>Topic</td><td>Author</td>"
        "<td>Views</td></tr>"
    ]
    for number, row in enumerate(usable, start=1):
        lines.append(
            "<tr><td>" + str(number) + "</td><td><a href='http://gimpchat.com/"
            + cell(row[7]) + "'>" + cell(row[2]) + "</a></td><td>"
            + cell(row[3]) + "</td><td>" + cell(row[5]) + "</td></tr>"
        )
    lines.append("</table>")
    return lines


if __name__ == "__main__":
    print("\n".join(render_topics_table(collect_topic_rows())))
python gc.py > test1.html
Fri Jan 12, 2024 9:58 am
Fri Jan 12, 2024 10:04 am
Fri Jan 12, 2024 10:07 am
Fri Jan 12, 2024 10:09 am
Fri Jan 12, 2024 11:18 am
Fri Jan 12, 2024 11:28 am
Fri Jan 12, 2024 11:37 am
Fri Jan 12, 2024 11:39 am
Fri Jan 12, 2024 11:40 am
Fri Jan 12, 2024 11:45 am
trandoductin wrote: It fits fine if you choose "Print" and then "as PDF" instead of saving as PDF.
I can't attach it because it's bigger than 6 MiB.
Fri Jan 12, 2024 11:51 am
Fri Jan 12, 2024 11:57 am
Fri Jan 12, 2024 12:01 pm
Fri Jan 12, 2024 12:16 pm
Fri Jan 12, 2024 1:27 pm
Fri Jan 12, 2024 1:33 pm
Fri Jan 12, 2024 1:40 pm
Fri Jan 12, 2024 1:41 pm