Cleaner way of appending data to List in BeautifulSoup
So I've been experimenting various way to get data from different variety of website; as such, between the use of JSON or BeautifulSoup. Currently, I have written a scrapper but it pretty much has no reusable code. I've been figuring out how to correct my approach of appending data to one singular list for simplicity and reusability. But I've pretty much hit a stone with my current capability.
from requests import get
from bs4 import BeautifulSoup
from pprint import pprint
import pandas as pd
from time import sleep
url = 'https://forum.lowyat.net/ReviewsandGuides'
list_topic =
list_description =
list_replies =
list_topicStarted =
list_totalViews =
def getContentFromURL(_url):
try:
response = get(_url)
html_soup = BeautifulSoup(response.text, 'lxml')
return html_soup
except Exception as e:
print('Error.getContentFromURL:', e)
return None
def iterateThroughPages(_lastindexpost, _postperpage, _url):
indices = '/+'
index = 0
for i in range(index, _lastindexpost):
print('Getting data from ' + url)
try:
extractDataFromRow1(getContentFromURL(_url))
extractDataFromRow2(getContentFromURL(_url))
print('current page index is: ' + str(index))
print(_url)
while i <= _lastindexpost:
for table in get(_url):
if table != None:
new_getPostPerPage = i + _postperpage
newlink = f'{url}{indices}{new_getPostPerPage}'
print(newlink)
bs_link = getContentFromURL(newlink)
extractDataFromRow1(bs_link)
extractDataFromRow2(bs_link)
# threading to prevent spam. Waits 0.5 secs before executing
sleep(0.5)
i += _postperpage
print('current page index is: ' + str(i))
if i > _lastindexpost:
# If i gets more than the input page(etc 1770) halts
print('No more available post to retrieve')
return
except Exception as e:
print('Error.iterateThroughPages:', e)
return None
def extractDataFromRow1(_url):
try:
for container in _url.find_all('td', {'class': 'row1', 'valign': 'middle'}):
# get data from topic title in table cell
topic = container.select_one(
'a[href^="/topic/"]').text.replace("n", "")
description = container.select_one(
'div.desc').text.replace("n", "")
if topic or description is not None:
dict_topic = topic
dict_description = description
if dict_description is '':
dict_description = 'No Data'
# list_description.append(dict_description)
#so no empty string#
list_topic.append(dict_topic)
list_description.append(dict_description)
else:
None
except Exception as e:
print('Error.extractDataFromRow1:', e)
return None
def extractDataFromRow2(_url):
try:
for container in _url.select('table[cellspacing="1"] > tr')[2:32]:
replies = container.select_one('td:nth-of-type(4)').text.strip()
topic_started = container.select_one(
'td:nth-of-type(5)').text.strip()
total_views = container.select_one(
'td:nth-of-type(6)').text.strip()
if replies or topic_started or total_views is not None:
dict_replies = replies
dict_topicStarted = topic_started
dict_totalViews = total_views
if dict_replies is '':
dict_replies = 'No Data'
elif dict_topicStarted is '':
dict_topicStarted = 'No Data'
elif dict_totalViews is '':
dict_totalViews = 'No Data'
list_replies.append(dict_replies)
list_topicStarted.append(dict_topicStarted)
list_totalViews.append(dict_totalViews)
else:
print('no data')
None
except Exception as e:
print('Error.extractDataFromRow2:', e)
return None
# limit to 1740
print(iterateThroughPages(1740, 30, url))
new_panda = pd.DataFrame(
{'Title': list_topic, 'Description': list_description,
'Replies': list_replies, 'Topic Starter': list_topicStarted, 'Total Views': list_totalViews})
print(new_panda)
I'm sure the use of my try
is redundant at this point as well, my large variety of List including, and the use of While
and For
is most likely practiced wrongly.
python python-3.x web-scraping beautifulsoup
New contributor
add a comment |
So I've been experimenting various way to get data from different variety of website; as such, between the use of JSON or BeautifulSoup. Currently, I have written a scrapper but it pretty much has no reusable code. I've been figuring out how to correct my approach of appending data to one singular list for simplicity and reusability. But I've pretty much hit a stone with my current capability.
from requests import get
from bs4 import BeautifulSoup
from pprint import pprint
import pandas as pd
from time import sleep
url = 'https://forum.lowyat.net/ReviewsandGuides'
list_topic =
list_description =
list_replies =
list_topicStarted =
list_totalViews =
def getContentFromURL(_url):
try:
response = get(_url)
html_soup = BeautifulSoup(response.text, 'lxml')
return html_soup
except Exception as e:
print('Error.getContentFromURL:', e)
return None
def iterateThroughPages(_lastindexpost, _postperpage, _url):
indices = '/+'
index = 0
for i in range(index, _lastindexpost):
print('Getting data from ' + url)
try:
extractDataFromRow1(getContentFromURL(_url))
extractDataFromRow2(getContentFromURL(_url))
print('current page index is: ' + str(index))
print(_url)
while i <= _lastindexpost:
for table in get(_url):
if table != None:
new_getPostPerPage = i + _postperpage
newlink = f'{url}{indices}{new_getPostPerPage}'
print(newlink)
bs_link = getContentFromURL(newlink)
extractDataFromRow1(bs_link)
extractDataFromRow2(bs_link)
# threading to prevent spam. Waits 0.5 secs before executing
sleep(0.5)
i += _postperpage
print('current page index is: ' + str(i))
if i > _lastindexpost:
# If i gets more than the input page(etc 1770) halts
print('No more available post to retrieve')
return
except Exception as e:
print('Error.iterateThroughPages:', e)
return None
def extractDataFromRow1(_url):
try:
for container in _url.find_all('td', {'class': 'row1', 'valign': 'middle'}):
# get data from topic title in table cell
topic = container.select_one(
'a[href^="/topic/"]').text.replace("n", "")
description = container.select_one(
'div.desc').text.replace("n", "")
if topic or description is not None:
dict_topic = topic
dict_description = description
if dict_description is '':
dict_description = 'No Data'
# list_description.append(dict_description)
#so no empty string#
list_topic.append(dict_topic)
list_description.append(dict_description)
else:
None
except Exception as e:
print('Error.extractDataFromRow1:', e)
return None
def extractDataFromRow2(_url):
try:
for container in _url.select('table[cellspacing="1"] > tr')[2:32]:
replies = container.select_one('td:nth-of-type(4)').text.strip()
topic_started = container.select_one(
'td:nth-of-type(5)').text.strip()
total_views = container.select_one(
'td:nth-of-type(6)').text.strip()
if replies or topic_started or total_views is not None:
dict_replies = replies
dict_topicStarted = topic_started
dict_totalViews = total_views
if dict_replies is '':
dict_replies = 'No Data'
elif dict_topicStarted is '':
dict_topicStarted = 'No Data'
elif dict_totalViews is '':
dict_totalViews = 'No Data'
list_replies.append(dict_replies)
list_topicStarted.append(dict_topicStarted)
list_totalViews.append(dict_totalViews)
else:
print('no data')
None
except Exception as e:
print('Error.extractDataFromRow2:', e)
return None
# limit to 1740
print(iterateThroughPages(1740, 30, url))
new_panda = pd.DataFrame(
{'Title': list_topic, 'Description': list_description,
'Replies': list_replies, 'Topic Starter': list_topicStarted, 'Total Views': list_totalViews})
print(new_panda)
I'm sure the use of my try
is redundant at this point as well, my large variety of List including, and the use of While
and For
is most likely practiced wrongly.
python python-3.x web-scraping beautifulsoup
New contributor
add a comment |
So I've been experimenting various way to get data from different variety of website; as such, between the use of JSON or BeautifulSoup. Currently, I have written a scrapper but it pretty much has no reusable code. I've been figuring out how to correct my approach of appending data to one singular list for simplicity and reusability. But I've pretty much hit a stone with my current capability.
from requests import get
from bs4 import BeautifulSoup
from pprint import pprint
import pandas as pd
from time import sleep
url = 'https://forum.lowyat.net/ReviewsandGuides'
list_topic =
list_description =
list_replies =
list_topicStarted =
list_totalViews =
def getContentFromURL(_url):
try:
response = get(_url)
html_soup = BeautifulSoup(response.text, 'lxml')
return html_soup
except Exception as e:
print('Error.getContentFromURL:', e)
return None
def iterateThroughPages(_lastindexpost, _postperpage, _url):
indices = '/+'
index = 0
for i in range(index, _lastindexpost):
print('Getting data from ' + url)
try:
extractDataFromRow1(getContentFromURL(_url))
extractDataFromRow2(getContentFromURL(_url))
print('current page index is: ' + str(index))
print(_url)
while i <= _lastindexpost:
for table in get(_url):
if table != None:
new_getPostPerPage = i + _postperpage
newlink = f'{url}{indices}{new_getPostPerPage}'
print(newlink)
bs_link = getContentFromURL(newlink)
extractDataFromRow1(bs_link)
extractDataFromRow2(bs_link)
# threading to prevent spam. Waits 0.5 secs before executing
sleep(0.5)
i += _postperpage
print('current page index is: ' + str(i))
if i > _lastindexpost:
# If i gets more than the input page(etc 1770) halts
print('No more available post to retrieve')
return
except Exception as e:
print('Error.iterateThroughPages:', e)
return None
def extractDataFromRow1(_url):
try:
for container in _url.find_all('td', {'class': 'row1', 'valign': 'middle'}):
# get data from topic title in table cell
topic = container.select_one(
'a[href^="/topic/"]').text.replace("n", "")
description = container.select_one(
'div.desc').text.replace("n", "")
if topic or description is not None:
dict_topic = topic
dict_description = description
if dict_description is '':
dict_description = 'No Data'
# list_description.append(dict_description)
#so no empty string#
list_topic.append(dict_topic)
list_description.append(dict_description)
else:
None
except Exception as e:
print('Error.extractDataFromRow1:', e)
return None
def extractDataFromRow2(_url):
try:
for container in _url.select('table[cellspacing="1"] > tr')[2:32]:
replies = container.select_one('td:nth-of-type(4)').text.strip()
topic_started = container.select_one(
'td:nth-of-type(5)').text.strip()
total_views = container.select_one(
'td:nth-of-type(6)').text.strip()
if replies or topic_started or total_views is not None:
dict_replies = replies
dict_topicStarted = topic_started
dict_totalViews = total_views
if dict_replies is '':
dict_replies = 'No Data'
elif dict_topicStarted is '':
dict_topicStarted = 'No Data'
elif dict_totalViews is '':
dict_totalViews = 'No Data'
list_replies.append(dict_replies)
list_topicStarted.append(dict_topicStarted)
list_totalViews.append(dict_totalViews)
else:
print('no data')
None
except Exception as e:
print('Error.extractDataFromRow2:', e)
return None
# limit to 1740
print(iterateThroughPages(1740, 30, url))
new_panda = pd.DataFrame(
{'Title': list_topic, 'Description': list_description,
'Replies': list_replies, 'Topic Starter': list_topicStarted, 'Total Views': list_totalViews})
print(new_panda)
I'm sure the use of my try
is redundant at this point as well, my large variety of List including, and the use of While
and For
is most likely practiced wrongly.
python python-3.x web-scraping beautifulsoup
New contributor
So I've been experimenting various way to get data from different variety of website; as such, between the use of JSON or BeautifulSoup. Currently, I have written a scrapper but it pretty much has no reusable code. I've been figuring out how to correct my approach of appending data to one singular list for simplicity and reusability. But I've pretty much hit a stone with my current capability.
from requests import get
from bs4 import BeautifulSoup
from pprint import pprint
import pandas as pd
from time import sleep
url = 'https://forum.lowyat.net/ReviewsandGuides'
list_topic =
list_description =
list_replies =
list_topicStarted =
list_totalViews =
def getContentFromURL(_url):
try:
response = get(_url)
html_soup = BeautifulSoup(response.text, 'lxml')
return html_soup
except Exception as e:
print('Error.getContentFromURL:', e)
return None
def iterateThroughPages(_lastindexpost, _postperpage, _url):
indices = '/+'
index = 0
for i in range(index, _lastindexpost):
print('Getting data from ' + url)
try:
extractDataFromRow1(getContentFromURL(_url))
extractDataFromRow2(getContentFromURL(_url))
print('current page index is: ' + str(index))
print(_url)
while i <= _lastindexpost:
for table in get(_url):
if table != None:
new_getPostPerPage = i + _postperpage
newlink = f'{url}{indices}{new_getPostPerPage}'
print(newlink)
bs_link = getContentFromURL(newlink)
extractDataFromRow1(bs_link)
extractDataFromRow2(bs_link)
# threading to prevent spam. Waits 0.5 secs before executing
sleep(0.5)
i += _postperpage
print('current page index is: ' + str(i))
if i > _lastindexpost:
# If i gets more than the input page(etc 1770) halts
print('No more available post to retrieve')
return
except Exception as e:
print('Error.iterateThroughPages:', e)
return None
def extractDataFromRow1(_url):
try:
for container in _url.find_all('td', {'class': 'row1', 'valign': 'middle'}):
# get data from topic title in table cell
topic = container.select_one(
'a[href^="/topic/"]').text.replace("n", "")
description = container.select_one(
'div.desc').text.replace("n", "")
if topic or description is not None:
dict_topic = topic
dict_description = description
if dict_description is '':
dict_description = 'No Data'
# list_description.append(dict_description)
#so no empty string#
list_topic.append(dict_topic)
list_description.append(dict_description)
else:
None
except Exception as e:
print('Error.extractDataFromRow1:', e)
return None
def extractDataFromRow2(_url):
try:
for container in _url.select('table[cellspacing="1"] > tr')[2:32]:
replies = container.select_one('td:nth-of-type(4)').text.strip()
topic_started = container.select_one(
'td:nth-of-type(5)').text.strip()
total_views = container.select_one(
'td:nth-of-type(6)').text.strip()
if replies or topic_started or total_views is not None:
dict_replies = replies
dict_topicStarted = topic_started
dict_totalViews = total_views
if dict_replies is '':
dict_replies = 'No Data'
elif dict_topicStarted is '':
dict_topicStarted = 'No Data'
elif dict_totalViews is '':
dict_totalViews = 'No Data'
list_replies.append(dict_replies)
list_topicStarted.append(dict_topicStarted)
list_totalViews.append(dict_totalViews)
else:
print('no data')
None
except Exception as e:
print('Error.extractDataFromRow2:', e)
return None
# limit to 1740
print(iterateThroughPages(1740, 30, url))
new_panda = pd.DataFrame(
{'Title': list_topic, 'Description': list_description,
'Replies': list_replies, 'Topic Starter': list_topicStarted, 'Total Views': list_totalViews})
print(new_panda)
I'm sure the use of my try
is redundant at this point as well, my large variety of List including, and the use of While
and For
is most likely practiced wrongly.
python python-3.x web-scraping beautifulsoup
python python-3.x web-scraping beautifulsoup
New contributor
New contributor
New contributor
asked 2 mins ago
MinialMinial
134
134
New contributor
New contributor
add a comment |
add a comment |
0
active
oldest
votes
Your Answer
StackExchange.ifUsing("editor", function () {
return StackExchange.using("mathjaxEditing", function () {
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
});
});
}, "mathjax-editing");
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "196"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);
StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});
function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: false,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});
}
});
Minial is a new contributor. Be nice, and check out our Code of Conduct.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f211238%2fcleaner-way-of-appending-data-to-list-in-beautifulsoup%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
0
active
oldest
votes
0
active
oldest
votes
active
oldest
votes
active
oldest
votes
Minial is a new contributor. Be nice, and check out our Code of Conduct.
Minial is a new contributor. Be nice, and check out our Code of Conduct.
Minial is a new contributor. Be nice, and check out our Code of Conduct.
Minial is a new contributor. Be nice, and check out our Code of Conduct.
Thanks for contributing an answer to Code Review Stack Exchange!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
Use MathJax to format equations. MathJax reference.
To learn more, see our tips on writing great answers.
Some of your past answers have not been well-received, and you're in danger of being blocked from answering.
Please pay close attention to the following guidance:
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f211238%2fcleaner-way-of-appending-data-to-list-in-beautifulsoup%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown