Cleaner way of appending data to List in BeautifulSoup












0














So I've been experimenting various way to get data from different variety of website; as such, between the use of JSON or BeautifulSoup. Currently, I have written a scrapper but it pretty much has no reusable code. I've been figuring out how to correct my approach of appending data to one singular list for simplicity and reusability. But I've pretty much hit a stone with my current capability.



from requests import get
from bs4 import BeautifulSoup
from pprint import pprint
import pandas as pd
from time import sleep


url = 'https://forum.lowyat.net/ReviewsandGuides'

list_topic =
list_description =
list_replies =
list_topicStarted =
list_totalViews =


def getContentFromURL(_url):
try:
response = get(_url)
html_soup = BeautifulSoup(response.text, 'lxml')
return html_soup
except Exception as e:
print('Error.getContentFromURL:', e)
return None


def iterateThroughPages(_lastindexpost, _postperpage, _url):
indices = '/+'
index = 0
for i in range(index, _lastindexpost):
print('Getting data from ' + url)
try:
extractDataFromRow1(getContentFromURL(_url))
extractDataFromRow2(getContentFromURL(_url))
print('current page index is: ' + str(index))
print(_url)
while i <= _lastindexpost:
for table in get(_url):
if table != None:
new_getPostPerPage = i + _postperpage
newlink = f'{url}{indices}{new_getPostPerPage}'
print(newlink)
bs_link = getContentFromURL(newlink)
extractDataFromRow1(bs_link)
extractDataFromRow2(bs_link)
# threading to prevent spam. Waits 0.5 secs before executing
sleep(0.5)
i += _postperpage
print('current page index is: ' + str(i))
if i > _lastindexpost:
# If i gets more than the input page(etc 1770) halts
print('No more available post to retrieve')
return
except Exception as e:
print('Error.iterateThroughPages:', e)
return None


def extractDataFromRow1(_url):
try:
for container in _url.find_all('td', {'class': 'row1', 'valign': 'middle'}):
# get data from topic title in table cell
topic = container.select_one(
'a[href^="/topic/"]').text.replace("n", "")
description = container.select_one(
'div.desc').text.replace("n", "")
if topic or description is not None:
dict_topic = topic
dict_description = description
if dict_description is '':
dict_description = 'No Data'
# list_description.append(dict_description)
#so no empty string#
list_topic.append(dict_topic)
list_description.append(dict_description)
else:
None
except Exception as e:
print('Error.extractDataFromRow1:', e)
return None


def extractDataFromRow2(_url):
try:
for container in _url.select('table[cellspacing="1"] > tr')[2:32]:
replies = container.select_one('td:nth-of-type(4)').text.strip()
topic_started = container.select_one(
'td:nth-of-type(5)').text.strip()
total_views = container.select_one(
'td:nth-of-type(6)').text.strip()
if replies or topic_started or total_views is not None:
dict_replies = replies
dict_topicStarted = topic_started
dict_totalViews = total_views
if dict_replies is '':
dict_replies = 'No Data'
elif dict_topicStarted is '':
dict_topicStarted = 'No Data'
elif dict_totalViews is '':
dict_totalViews = 'No Data'
list_replies.append(dict_replies)
list_topicStarted.append(dict_topicStarted)
list_totalViews.append(dict_totalViews)
else:
print('no data')
None
except Exception as e:
print('Error.extractDataFromRow2:', e)
return None


# limit to 1740
print(iterateThroughPages(1740, 30, url))
new_panda = pd.DataFrame(
{'Title': list_topic, 'Description': list_description,
'Replies': list_replies, 'Topic Starter': list_topicStarted, 'Total Views': list_totalViews})
print(new_panda)


I'm sure the use of my try is redundant at this point as well, my large variety of List including, and the use of While and For is most likely practiced wrongly.









share







New contributor




Minial is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.

























    0














    So I've been experimenting various way to get data from different variety of website; as such, between the use of JSON or BeautifulSoup. Currently, I have written a scrapper but it pretty much has no reusable code. I've been figuring out how to correct my approach of appending data to one singular list for simplicity and reusability. But I've pretty much hit a stone with my current capability.



    from requests import get
    from bs4 import BeautifulSoup
    from pprint import pprint
    import pandas as pd
    from time import sleep


    url = 'https://forum.lowyat.net/ReviewsandGuides'

    list_topic =
    list_description =
    list_replies =
    list_topicStarted =
    list_totalViews =


    def getContentFromURL(_url):
    try:
    response = get(_url)
    html_soup = BeautifulSoup(response.text, 'lxml')
    return html_soup
    except Exception as e:
    print('Error.getContentFromURL:', e)
    return None


    def iterateThroughPages(_lastindexpost, _postperpage, _url):
    indices = '/+'
    index = 0
    for i in range(index, _lastindexpost):
    print('Getting data from ' + url)
    try:
    extractDataFromRow1(getContentFromURL(_url))
    extractDataFromRow2(getContentFromURL(_url))
    print('current page index is: ' + str(index))
    print(_url)
    while i <= _lastindexpost:
    for table in get(_url):
    if table != None:
    new_getPostPerPage = i + _postperpage
    newlink = f'{url}{indices}{new_getPostPerPage}'
    print(newlink)
    bs_link = getContentFromURL(newlink)
    extractDataFromRow1(bs_link)
    extractDataFromRow2(bs_link)
    # threading to prevent spam. Waits 0.5 secs before executing
    sleep(0.5)
    i += _postperpage
    print('current page index is: ' + str(i))
    if i > _lastindexpost:
    # If i gets more than the input page(etc 1770) halts
    print('No more available post to retrieve')
    return
    except Exception as e:
    print('Error.iterateThroughPages:', e)
    return None


    def extractDataFromRow1(_url):
    try:
    for container in _url.find_all('td', {'class': 'row1', 'valign': 'middle'}):
    # get data from topic title in table cell
    topic = container.select_one(
    'a[href^="/topic/"]').text.replace("n", "")
    description = container.select_one(
    'div.desc').text.replace("n", "")
    if topic or description is not None:
    dict_topic = topic
    dict_description = description
    if dict_description is '':
    dict_description = 'No Data'
    # list_description.append(dict_description)
    #so no empty string#
    list_topic.append(dict_topic)
    list_description.append(dict_description)
    else:
    None
    except Exception as e:
    print('Error.extractDataFromRow1:', e)
    return None


    def extractDataFromRow2(_url):
    try:
    for container in _url.select('table[cellspacing="1"] > tr')[2:32]:
    replies = container.select_one('td:nth-of-type(4)').text.strip()
    topic_started = container.select_one(
    'td:nth-of-type(5)').text.strip()
    total_views = container.select_one(
    'td:nth-of-type(6)').text.strip()
    if replies or topic_started or total_views is not None:
    dict_replies = replies
    dict_topicStarted = topic_started
    dict_totalViews = total_views
    if dict_replies is '':
    dict_replies = 'No Data'
    elif dict_topicStarted is '':
    dict_topicStarted = 'No Data'
    elif dict_totalViews is '':
    dict_totalViews = 'No Data'
    list_replies.append(dict_replies)
    list_topicStarted.append(dict_topicStarted)
    list_totalViews.append(dict_totalViews)
    else:
    print('no data')
    None
    except Exception as e:
    print('Error.extractDataFromRow2:', e)
    return None


    # limit to 1740
    print(iterateThroughPages(1740, 30, url))
    new_panda = pd.DataFrame(
    {'Title': list_topic, 'Description': list_description,
    'Replies': list_replies, 'Topic Starter': list_topicStarted, 'Total Views': list_totalViews})
    print(new_panda)


    I'm sure the use of my try is redundant at this point as well, my large variety of List including, and the use of While and For is most likely practiced wrongly.









    share







    New contributor




    Minial is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
    Check out our Code of Conduct.























      0












      0








      0







      So I've been experimenting various way to get data from different variety of website; as such, between the use of JSON or BeautifulSoup. Currently, I have written a scrapper but it pretty much has no reusable code. I've been figuring out how to correct my approach of appending data to one singular list for simplicity and reusability. But I've pretty much hit a stone with my current capability.



      from requests import get
      from bs4 import BeautifulSoup
      from pprint import pprint
      import pandas as pd
      from time import sleep


      url = 'https://forum.lowyat.net/ReviewsandGuides'

      list_topic =
      list_description =
      list_replies =
      list_topicStarted =
      list_totalViews =


      def getContentFromURL(_url):
      try:
      response = get(_url)
      html_soup = BeautifulSoup(response.text, 'lxml')
      return html_soup
      except Exception as e:
      print('Error.getContentFromURL:', e)
      return None


      def iterateThroughPages(_lastindexpost, _postperpage, _url):
      indices = '/+'
      index = 0
      for i in range(index, _lastindexpost):
      print('Getting data from ' + url)
      try:
      extractDataFromRow1(getContentFromURL(_url))
      extractDataFromRow2(getContentFromURL(_url))
      print('current page index is: ' + str(index))
      print(_url)
      while i <= _lastindexpost:
      for table in get(_url):
      if table != None:
      new_getPostPerPage = i + _postperpage
      newlink = f'{url}{indices}{new_getPostPerPage}'
      print(newlink)
      bs_link = getContentFromURL(newlink)
      extractDataFromRow1(bs_link)
      extractDataFromRow2(bs_link)
      # threading to prevent spam. Waits 0.5 secs before executing
      sleep(0.5)
      i += _postperpage
      print('current page index is: ' + str(i))
      if i > _lastindexpost:
      # If i gets more than the input page(etc 1770) halts
      print('No more available post to retrieve')
      return
      except Exception as e:
      print('Error.iterateThroughPages:', e)
      return None


      def extractDataFromRow1(_url):
      try:
      for container in _url.find_all('td', {'class': 'row1', 'valign': 'middle'}):
      # get data from topic title in table cell
      topic = container.select_one(
      'a[href^="/topic/"]').text.replace("n", "")
      description = container.select_one(
      'div.desc').text.replace("n", "")
      if topic or description is not None:
      dict_topic = topic
      dict_description = description
      if dict_description is '':
      dict_description = 'No Data'
      # list_description.append(dict_description)
      #so no empty string#
      list_topic.append(dict_topic)
      list_description.append(dict_description)
      else:
      None
      except Exception as e:
      print('Error.extractDataFromRow1:', e)
      return None


      def extractDataFromRow2(_url):
      try:
      for container in _url.select('table[cellspacing="1"] > tr')[2:32]:
      replies = container.select_one('td:nth-of-type(4)').text.strip()
      topic_started = container.select_one(
      'td:nth-of-type(5)').text.strip()
      total_views = container.select_one(
      'td:nth-of-type(6)').text.strip()
      if replies or topic_started or total_views is not None:
      dict_replies = replies
      dict_topicStarted = topic_started
      dict_totalViews = total_views
      if dict_replies is '':
      dict_replies = 'No Data'
      elif dict_topicStarted is '':
      dict_topicStarted = 'No Data'
      elif dict_totalViews is '':
      dict_totalViews = 'No Data'
      list_replies.append(dict_replies)
      list_topicStarted.append(dict_topicStarted)
      list_totalViews.append(dict_totalViews)
      else:
      print('no data')
      None
      except Exception as e:
      print('Error.extractDataFromRow2:', e)
      return None


      # limit to 1740
      print(iterateThroughPages(1740, 30, url))
      new_panda = pd.DataFrame(
      {'Title': list_topic, 'Description': list_description,
      'Replies': list_replies, 'Topic Starter': list_topicStarted, 'Total Views': list_totalViews})
      print(new_panda)


      I'm sure the use of my try is redundant at this point as well, my large variety of List including, and the use of While and For is most likely practiced wrongly.









      share







      New contributor




      Minial is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.











      So I've been experimenting various way to get data from different variety of website; as such, between the use of JSON or BeautifulSoup. Currently, I have written a scrapper but it pretty much has no reusable code. I've been figuring out how to correct my approach of appending data to one singular list for simplicity and reusability. But I've pretty much hit a stone with my current capability.



      from requests import get
      from bs4 import BeautifulSoup
      from pprint import pprint
      import pandas as pd
      from time import sleep


      url = 'https://forum.lowyat.net/ReviewsandGuides'

      list_topic =
      list_description =
      list_replies =
      list_topicStarted =
      list_totalViews =


      def getContentFromURL(_url):
      try:
      response = get(_url)
      html_soup = BeautifulSoup(response.text, 'lxml')
      return html_soup
      except Exception as e:
      print('Error.getContentFromURL:', e)
      return None


      def iterateThroughPages(_lastindexpost, _postperpage, _url):
      indices = '/+'
      index = 0
      for i in range(index, _lastindexpost):
      print('Getting data from ' + url)
      try:
      extractDataFromRow1(getContentFromURL(_url))
      extractDataFromRow2(getContentFromURL(_url))
      print('current page index is: ' + str(index))
      print(_url)
      while i <= _lastindexpost:
      for table in get(_url):
      if table != None:
      new_getPostPerPage = i + _postperpage
      newlink = f'{url}{indices}{new_getPostPerPage}'
      print(newlink)
      bs_link = getContentFromURL(newlink)
      extractDataFromRow1(bs_link)
      extractDataFromRow2(bs_link)
      # threading to prevent spam. Waits 0.5 secs before executing
      sleep(0.5)
      i += _postperpage
      print('current page index is: ' + str(i))
      if i > _lastindexpost:
      # If i gets more than the input page(etc 1770) halts
      print('No more available post to retrieve')
      return
      except Exception as e:
      print('Error.iterateThroughPages:', e)
      return None


      def extractDataFromRow1(_url):
      try:
      for container in _url.find_all('td', {'class': 'row1', 'valign': 'middle'}):
      # get data from topic title in table cell
      topic = container.select_one(
      'a[href^="/topic/"]').text.replace("n", "")
      description = container.select_one(
      'div.desc').text.replace("n", "")
      if topic or description is not None:
      dict_topic = topic
      dict_description = description
      if dict_description is '':
      dict_description = 'No Data'
      # list_description.append(dict_description)
      #so no empty string#
      list_topic.append(dict_topic)
      list_description.append(dict_description)
      else:
      None
      except Exception as e:
      print('Error.extractDataFromRow1:', e)
      return None


      def extractDataFromRow2(_url):
      try:
      for container in _url.select('table[cellspacing="1"] > tr')[2:32]:
      replies = container.select_one('td:nth-of-type(4)').text.strip()
      topic_started = container.select_one(
      'td:nth-of-type(5)').text.strip()
      total_views = container.select_one(
      'td:nth-of-type(6)').text.strip()
      if replies or topic_started or total_views is not None:
      dict_replies = replies
      dict_topicStarted = topic_started
      dict_totalViews = total_views
      if dict_replies is '':
      dict_replies = 'No Data'
      elif dict_topicStarted is '':
      dict_topicStarted = 'No Data'
      elif dict_totalViews is '':
      dict_totalViews = 'No Data'
      list_replies.append(dict_replies)
      list_topicStarted.append(dict_topicStarted)
      list_totalViews.append(dict_totalViews)
      else:
      print('no data')
      None
      except Exception as e:
      print('Error.extractDataFromRow2:', e)
      return None


      # limit to 1740
      print(iterateThroughPages(1740, 30, url))
      new_panda = pd.DataFrame(
      {'Title': list_topic, 'Description': list_description,
      'Replies': list_replies, 'Topic Starter': list_topicStarted, 'Total Views': list_totalViews})
      print(new_panda)


      I'm sure the use of my try is redundant at this point as well, my large variety of List including, and the use of While and For is most likely practiced wrongly.







      python python-3.x web-scraping beautifulsoup





      share







      New contributor




      Minial is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.










      share







      New contributor




      Minial is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.








      share



      share






      New contributor




      Minial is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.









      asked 2 mins ago









      MinialMinial

      134




      134




      New contributor




      Minial is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.





      New contributor





      Minial is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.






      Minial is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.






















          0






          active

          oldest

          votes











          Your Answer





          StackExchange.ifUsing("editor", function () {
          return StackExchange.using("mathjaxEditing", function () {
          StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
          StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
          });
          });
          }, "mathjax-editing");

          StackExchange.ifUsing("editor", function () {
          StackExchange.using("externalEditor", function () {
          StackExchange.using("snippets", function () {
          StackExchange.snippets.init();
          });
          });
          }, "code-snippets");

          StackExchange.ready(function() {
          var channelOptions = {
          tags: "".split(" "),
          id: "196"
          };
          initTagRenderer("".split(" "), "".split(" "), channelOptions);

          StackExchange.using("externalEditor", function() {
          // Have to fire editor after snippets, if snippets enabled
          if (StackExchange.settings.snippets.snippetsEnabled) {
          StackExchange.using("snippets", function() {
          createEditor();
          });
          }
          else {
          createEditor();
          }
          });

          function createEditor() {
          StackExchange.prepareEditor({
          heartbeatType: 'answer',
          autoActivateHeartbeat: false,
          convertImagesToLinks: false,
          noModals: true,
          showLowRepImageUploadWarning: true,
          reputationToPostImages: null,
          bindNavPrevention: true,
          postfix: "",
          imageUploader: {
          brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
          contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
          allowUrls: true
          },
          onDemand: true,
          discardSelector: ".discard-answer"
          ,immediatelyShowMarkdownHelp:true
          });


          }
          });






          Minial is a new contributor. Be nice, and check out our Code of Conduct.










          draft saved

          draft discarded


















          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f211238%2fcleaner-way-of-appending-data-to-list-in-beautifulsoup%23new-answer', 'question_page');
          }
          );

          Post as a guest















          Required, but never shown

























          0






          active

          oldest

          votes








          0






          active

          oldest

          votes









          active

          oldest

          votes






          active

          oldest

          votes








          Minial is a new contributor. Be nice, and check out our Code of Conduct.










          draft saved

          draft discarded


















          Minial is a new contributor. Be nice, and check out our Code of Conduct.













          Minial is a new contributor. Be nice, and check out our Code of Conduct.












          Minial is a new contributor. Be nice, and check out our Code of Conduct.
















          Thanks for contributing an answer to Code Review Stack Exchange!


          • Please be sure to answer the question. Provide details and share your research!

          But avoid



          • Asking for help, clarification, or responding to other answers.

          • Making statements based on opinion; back them up with references or personal experience.


          Use MathJax to format equations. MathJax reference.


          To learn more, see our tips on writing great answers.





          Some of your past answers have not been well-received, and you're in danger of being blocked from answering.


          Please pay close attention to the following guidance:


          • Please be sure to answer the question. Provide details and share your research!

          But avoid



          • Asking for help, clarification, or responding to other answers.

          • Making statements based on opinion; back them up with references or personal experience.


          To learn more, see our tips on writing great answers.




          draft saved


          draft discarded














          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f211238%2fcleaner-way-of-appending-data-to-list-in-beautifulsoup%23new-answer', 'question_page');
          }
          );

          Post as a guest















          Required, but never shown





















































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown

































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown







          Popular posts from this blog

          Morgemoulin

          Scott Moir

          Souastre