Scraping Clinical Studies database (via ClinicalTrials.Gov)
I have written the following script to scrape data from the U.S. National Library of Medicine website ClinicalTrials.Gov based on an NCTID.
def clinicalTrialsGov(nctid):
    """Fetch the ClinicalTrials.gov XML record for *nctid* and return a dict
    mapping 'ct'-prefixed field names to their text values.

    Single-valued tags come from one find_all() pass; tags that may repeat
    (intervention_name, intervention_type, arm_group_type) are merged into a
    single comma-separated value.  Empty values are dropped from the result.
    """
    # displayxml=true makes the endpoint return the raw study XML record.
    data = BeautifulSoup(
        requests.get("https://clinicaltrials.gov/ct2/show/" + nctid + "?displayxml=true").text,
        "xml")
    subset = ['study_type', 'allocation', 'intervention_model', 'primary_purpose',
              'masking', 'enrollment', 'official_title', 'condition', 'minimum_age',
              'maximum_age', 'gender', 'healthy_volunteers', 'phase',
              'primary_outcome', 'secondary_outcome', 'number_of_arms']
    # Dict comprehension replaces the index-based dict((...) for i in range(...));
    # as before, a later match for the same tag name overwrites an earlier one.
    tag_dict = {'ct' + tag.name.capitalize(): tag.text
                for tag in data.find_all(subset)}
    tag_dict = multipleFields(data, ['intervention_name'], tag_dict)
    tag_dict = multipleFields(data, ['intervention_type'], tag_dict)
    tag_dict = multipleFields(data, ['arm_group_type'], tag_dict)
    tag_dict['ctID'] = nctid
    return removeEmptyKeys(tag_dict)
def multipleFields(data, subset, tagDict):
    """Join the text of every tag named in *subset* (a list of tag names)
    into one comma-separated string and store it in *tagDict* under
    'ct' + the capitalized first tag name.

    Returns the (mutated) *tagDict*.  If no tags match, the key is still
    set, to the empty string — removeEmptyKeys() filters it out later.
    """
    fields = data.find_all(subset)
    # List comprehension replaces the broken `field =` / append loop.
    # The bare try/except is removed: nothing here legitimately raises,
    # and a bare except would silently hide real bugs.
    texts = [each.text for each in fields]
    tagDict['ct' + subset[0].capitalize()] = ", ".join(texts)
    return tagDict
def removeEmptyKeys(dict1):
    """Return a copy of *dict1* without entries whose value stringifies to ''.

    The original compared with `is not ''` — an identity check that only
    worked by accident of CPython string interning; `!=` is the correct
    equality comparison.
    """
    return {key: value for key, value in dict1.items() if str(value) != ''}
What can I do to make this process more efficient?
python python-3.x web-scraping dictionary beautifulsoup
add a comment |
I have written the following script to scrape data from the U.S. National Library of Medicine website ClinicalTrials.Gov based on an NCTID.
def clinicalTrialsGov (nctid):
data = BeautifulSoup(requests.get("https://clinicaltrials.gov/ct2/show/" + nctid + "?displayxml=true").text, "xml")
subset = ['study_type', 'allocation', 'intervention_model', 'primary_purpose', 'masking', 'enrollment', 'official_title', 'condition', 'minimum_age', 'maximum_age', 'gender', 'healthy_volunteers', 'phase', 'primary_outcome', 'secondary_outcome', 'number_of_arms']
tag_matches = data.find_all(subset)
tag_dict = dict((str('ct' + tag_matches[i].name.capitalize()), tag_matches[i].text) for i in range(0, len(tag_matches)))
tag_dict = multipleFields(data, ['intervention_name'], tag_dict)
tag_dict = multipleFields(data, ['intervention_type'], tag_dict)
tag_dict = multipleFields(data, ['arm_group_type'], tag_dict)
tag_dict['ctID'] = nctid
#for key in tag_dict:
#print(key + ': ' + tag_dict[key])
return removeEmptyKeys(tag_dict)
def multipleFields (data, subset, tagDict):
fields = data.find_all(subset)
field =
try:
for each in fields:
field.append(each.text)
tagDict[str('ct' + subset[0].capitalize())] = ", ".join(field)
return tagDict
except:
return tagDict
def removeEmptyKeys (dict1):
newDict = {}
for key in dict1:
if str(dict1[key]) is not '':
newDict[key] = dict1[key]
return newDict
What can I do to make this process more efficent?
python python-3.x web-scraping dictionary beautifulsoup
How many NCTIDs are you planning on scraping? If you need more than one you should look into requests.Session
and possiblyscrapy
.
– Graipher
May 15 at 16:15
@Graipher I plan on scraping around 100,000 NCTIDs.
– jdoe
May 15 at 18:56
1
FYI the data on ClinicalTrials.gov is also available in a publicly available PostgreSQL database.
– Daniel McCracken
May 17 at 15:11
add a comment |
I have written the following script to scrape data from the U.S. National Library of Medicine website ClinicalTrials.Gov based on an NCTID.
def clinicalTrialsGov (nctid):
data = BeautifulSoup(requests.get("https://clinicaltrials.gov/ct2/show/" + nctid + "?displayxml=true").text, "xml")
subset = ['study_type', 'allocation', 'intervention_model', 'primary_purpose', 'masking', 'enrollment', 'official_title', 'condition', 'minimum_age', 'maximum_age', 'gender', 'healthy_volunteers', 'phase', 'primary_outcome', 'secondary_outcome', 'number_of_arms']
tag_matches = data.find_all(subset)
tag_dict = dict((str('ct' + tag_matches[i].name.capitalize()), tag_matches[i].text) for i in range(0, len(tag_matches)))
tag_dict = multipleFields(data, ['intervention_name'], tag_dict)
tag_dict = multipleFields(data, ['intervention_type'], tag_dict)
tag_dict = multipleFields(data, ['arm_group_type'], tag_dict)
tag_dict['ctID'] = nctid
#for key in tag_dict:
#print(key + ': ' + tag_dict[key])
return removeEmptyKeys(tag_dict)
def multipleFields (data, subset, tagDict):
fields = data.find_all(subset)
field =
try:
for each in fields:
field.append(each.text)
tagDict[str('ct' + subset[0].capitalize())] = ", ".join(field)
return tagDict
except:
return tagDict
def removeEmptyKeys (dict1):
newDict = {}
for key in dict1:
if str(dict1[key]) is not '':
newDict[key] = dict1[key]
return newDict
What can I do to make this process more efficent?
python python-3.x web-scraping dictionary beautifulsoup
I have written the following script to scrape data from the U.S. National Library of Medicine website ClinicalTrials.Gov based on an NCTID.
def clinicalTrialsGov (nctid):
data = BeautifulSoup(requests.get("https://clinicaltrials.gov/ct2/show/" + nctid + "?displayxml=true").text, "xml")
subset = ['study_type', 'allocation', 'intervention_model', 'primary_purpose', 'masking', 'enrollment', 'official_title', 'condition', 'minimum_age', 'maximum_age', 'gender', 'healthy_volunteers', 'phase', 'primary_outcome', 'secondary_outcome', 'number_of_arms']
tag_matches = data.find_all(subset)
tag_dict = dict((str('ct' + tag_matches[i].name.capitalize()), tag_matches[i].text) for i in range(0, len(tag_matches)))
tag_dict = multipleFields(data, ['intervention_name'], tag_dict)
tag_dict = multipleFields(data, ['intervention_type'], tag_dict)
tag_dict = multipleFields(data, ['arm_group_type'], tag_dict)
tag_dict['ctID'] = nctid
#for key in tag_dict:
#print(key + ': ' + tag_dict[key])
return removeEmptyKeys(tag_dict)
def multipleFields (data, subset, tagDict):
fields = data.find_all(subset)
field =
try:
for each in fields:
field.append(each.text)
tagDict[str('ct' + subset[0].capitalize())] = ", ".join(field)
return tagDict
except:
return tagDict
def removeEmptyKeys (dict1):
newDict = {}
for key in dict1:
if str(dict1[key]) is not '':
newDict[key] = dict1[key]
return newDict
What can I do to make this process more efficent?
python python-3.x web-scraping dictionary beautifulsoup
python python-3.x web-scraping dictionary beautifulsoup
edited May 15 at 15:46
Sᴀᴍ Onᴇᴌᴀ
8,33161853
8,33161853
asked May 15 at 14:40
Bob
361
361
How many NCTIDs are you planning on scraping? If you need more than one you should look intorequests.Session
and possiblyscrapy
.
– Graipher
May 15 at 16:15
@Graipher I plan on scraping around 100,000 NCTIDs.
– jdoe
May 15 at 18:56
1
FYI the data on ClinicalTrials.gov is also available in a publicly available PostgreSQL database.
– Daniel McCracken
May 17 at 15:11
add a comment |
How many NCTIDs are you planning on scraping? If you need more than one you should look intorequests.Session
and possiblyscrapy
.
– Graipher
May 15 at 16:15
@Graipher I plan on scraping around 100,000 NCTIDs.
– jdoe
May 15 at 18:56
1
FYI the data on ClinicalTrials.gov is also available in a publicly available PostgreSQL database.
– Daniel McCracken
May 17 at 15:11
How many NCTIDs are you planning on scraping? If you need more than one you should look into
requests.Session
and possibly scrapy
.– Graipher
May 15 at 16:15
How many NCTIDs are you planning on scraping? If you need more than one you should look into
requests.Session
and possibly scrapy
.– Graipher
May 15 at 16:15
@Graipher I plan on scraping around 100,000 NCTIDs.
– jdoe
May 15 at 18:56
@Graipher I plan on scraping around 100,000 NCTIDs.
– jdoe
May 15 at 18:56
1
1
FYI the data on ClinicalTrials.gov is also available in a publicly available PostgreSQL database.
– Daniel McCracken
May 17 at 15:11
FYI the data on ClinicalTrials.gov is also available in a publicly available PostgreSQL database.
– Daniel McCracken
May 17 at 15:11
add a comment |
3 Answers
3
active
oldest
votes
I hope I'm not too late.
There are a few things you could do :
Solution 1
import requests
from bs4 import BeautifulSoup
import pprint


def clinicalTrialsGov(nctid):
    """Return a dict of 'ct'-prefixed study fields scraped for *nctid*."""
    data = BeautifulSoup(
        requests.get("https://clinicaltrials.gov/ct2/show/" + nctid + "?displayxml=true").text,
        "xml")
    subset = ['study_type', 'allocation', 'intervention_model', 'primary_purpose',
              'masking', 'enrollment', 'official_title', 'condition', 'minimum_age',
              'maximum_age', 'gender', 'healthy_volunteers', 'phase',
              'primary_outcome', 'secondary_outcome', 'number_of_arms']
    tag_matches = data.find_all(subset)
    # Dict comprehension: later matches for the same tag overwrite earlier ones.
    tag_dict = {'ct' + current_tag.name.capitalize(): current_tag.text
                for current_tag in tag_matches}
    tag_dict = multipleFields(data, 'intervention_name', tag_dict)
    tag_dict = multipleFields(data, 'intervention_type', tag_dict)
    tag_dict = multipleFields(data, 'arm_group_type', tag_dict)
    tag_dict['ctID'] = nctid
    return removeEmptyKeys(tag_dict)


def multipleFields(data, subset, tagDict):
    """Store the comma-joined text of every <subset> tag in *tagDict*.

    Here *subset* is a single tag-name string, not a list.
    """
    fields = data.find_all(subset)
    field = [each.text for each in fields]
    tagDict['ct' + subset.capitalize()] = ", ".join(field)
    return tagDict


def removeEmptyKeys(dict1):
    """Drop entries whose value is falsy (e.g. the empty string)."""
    return {k: v for (k, v) in dict1.items() if v}


if __name__ == "__main__":
    # Guarded so importing this module does not trigger a network request.
    pprint.pprint(clinicalTrialsGov("NCT01220960"))
- I have used a dictionary comprehension to define
tag_dict
andnewDict
. This is similar to a list comprehension or a generator expression but specialized for dictionaries - I have removed the
try … except
frommultipleFields
because I don't see in which case an exception will be raised (especially since you didn't specify which one you were trying to catch) - I have presumed that
subset
inmultipleFields()
is a string and not a list of strings since you were looking only for one tag - I have used a list comprehension to define
field
inmultipleFields()
- I have used the
pprint
module to see better the answer.
New contributor
add a comment |
But we can go further:
Solution 2
import requests
from bs4 import BeautifulSoup
import pprint


def clinicalTrialsGov(nctid):
    """Return a dict of 'ct'-prefixed study fields scraped for *nctid*.

    Each tag name is looked up separately with find_all(), so zero, one,
    or many matches all work; multiple matches are comma-joined and empty
    ones are filtered out.
    """
    data = BeautifulSoup(
        requests.get("https://clinicaltrials.gov/ct2/show/" + nctid + "?displayxml=true").text,
        "xml")
    subset = ['study_type', 'allocation', 'intervention_model',
              'primary_purpose', 'masking', 'enrollment',
              'official_title', 'condition', 'minimum_age',
              'maximum_age', 'gender', 'healthy_volunteers',
              'phase', 'primary_outcome', 'secondary_outcome',
              'number_of_arms', 'intervention_name',
              'intervention_type', 'arm_group_type']
    tag_dict = {f'ct{subset_detail.capitalize()}': [current_tag.text
                                                    for current_tag
                                                    in data.find_all(subset_detail)]
                for subset_detail in subset}
    # Merge multi-valued lists and drop tags with no text at all.
    result_data = {k: ", ".join(v) for (k, v) in tag_dict.items() if v}
    result_data['ctID'] = nctid
    return result_data


if __name__ == "__main__":
    # Guarded so importing this module does not trigger a network request.
    pprint.pprint(clinicalTrialsGov("NCT01220960"))
- Instead of looking for almost all tags at the same time, I use a dictionary comprehension to look for each tag separately, creating a list containing the text retrieved for that tag. This works if there are zero, one, or many matching tags.
- I created another dictionary (
result_data
) to merge the answers (if there's more than one) and filter out the tags that don't have text associated with them.
New contributor
add a comment |
I have looked at the XML data coming in and I noticed that, for example, 'primary_outcome' and 'secondary_outcome' include other tags ('measure', 'time_frame' and 'description'). Maybe you need all the information in those tags, but if you need to retrieve only the 'measure' for these tags you could do something like this:
Solution 3
import requests
from bs4 import BeautifulSoup
import pprint


def clinicalTrialsGov(nctid):
    """Return 'ct'-prefixed study fields for *nctid*.

    Outcome tags (primary/secondary) contain child tags such as <measure>,
    <time_frame> and <description>; only the <measure> text is kept for them.
    """
    data = BeautifulSoup(
        requests.get("https://clinicaltrials.gov/ct2/show/" + nctid + "?displayxml=true").text,
        "xml")
    subset = ['study_type', 'allocation', 'intervention_model',
              'primary_purpose', 'masking', 'enrollment',
              'official_title', 'condition', 'minimum_age',
              'maximum_age', 'gender', 'healthy_volunteers',
              'phase', 'number_of_arms', 'intervention_name',
              'intervention_type', 'arm_group_type']
    subset_has_measure = ['primary_outcome', 'secondary_outcome']
    tag_dict = {f'ct{subset_detail.capitalize()}': [current_tag.text
                                                    for current_tag
                                                    in data.find_all(subset_detail)]
                for subset_detail in subset}
    # .select() takes a CSS selector, so 'primary_outcome measure' matches
    # only the <measure> descendants instead of the whole outcome block.
    tag_dict_with_measure = {f'ct{subset_detail.capitalize()}': [current_tag.text
                                                                 for current_tag
                                                                 in data.select(f'{subset_detail} measure')]
                             for subset_detail in subset_has_measure}
    result_data = {k: ", ".join(v) for (k, v) in tag_dict.items() if v}
    result_data.update((k, ", ".join(v))
                       for (k, v) in tag_dict_with_measure.items() if v)
    result_data['ctID'] = nctid
    return result_data


if __name__ == "__main__":
    # Guarded so importing this module does not trigger a network request.
    pprint.pprint(clinicalTrialsGov("NCT01220960"))
- Instead of using
.find_all()
I use.select()
which enables us to use a CSS expression to the tag we want
You could in fact generalize this situation for something else than 'measure':
Solution 4
import requests
from bs4 import BeautifulSoup
import pprint


def clinicalTrialsGov(nctid):
    """Return 'ct'-prefixed study fields for *nctid*.

    *subset* maps an optional child-tag selector ('' for "take the tag's own
    text", 'measure' for "take only the <measure> child") to the tag names it
    applies to, generalizing the special-casing of outcome tags.
    """
    data = BeautifulSoup(
        requests.get("https://clinicaltrials.gov/ct2/show/" + nctid + "?displayxml=true").text,
        "xml")
    subset = {'': ['study_type', 'allocation', 'intervention_model',
                   'primary_purpose', 'masking', 'enrollment',
                   'official_title', 'condition', 'minimum_age',
                   'maximum_age', 'gender', 'healthy_volunteers', 'phase',
                   'number_of_arms', 'intervention_name', 'intervention_type',
                   'arm_group_type'],
              'measure': ['primary_outcome', 'secondary_outcome']
              }
    # NOTE(review): with an empty category the selector gets a trailing space
    # (e.g. 'study_type '); BeautifulSoup appears to tolerate this — confirm.
    tag_dict = {f'ct{subset_detail.capitalize()}': [current_tag.text
                                                    for current_tag
                                                    in data.select(f'{subset_detail} {subset_category}')]
                for (subset_category, subset_types) in subset.items()
                for subset_detail in subset_types}
    result_data = {k: ", ".join(v) for (k, v) in tag_dict.items() if v}
    result_data['ctID'] = nctid
    return result_data


if __name__ == "__main__":
    # Guarded so importing this module does not trigger a network request.
    pprint.pprint(clinicalTrialsGov("NCT01220960"))
New contributor
add a comment |
Your Answer
StackExchange.ifUsing("editor", function () {
return StackExchange.using("mathjaxEditing", function () {
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
});
});
}, "mathjax-editing");
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "196"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);
StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});
function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: false,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});
}
});
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f194465%2fscraping-clinical-studies-database-via-clinicaltrials-gov%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
3 Answers
3
active
oldest
votes
3 Answers
3
active
oldest
votes
active
oldest
votes
active
oldest
votes
I hope I'm not too late.
There are a few things you could do :
Solution 1
import requests
from bs4 import BeautifulSoup
import pprint
def clinicalTrialsGov (nctid):
data = BeautifulSoup(requests.get("https://clinicaltrials.gov/ct2/show/" + nctid + "?displayxml=true").text, "xml")
subset = ['study_type', 'allocation', 'intervention_model', 'primary_purpose', 'masking', 'enrollment', 'official_title', 'condition', 'minimum_age', 'maximum_age', 'gender', 'healthy_volunteers', 'phase', 'primary_outcome', 'secondary_outcome', 'number_of_arms']
tag_matches = data.find_all(subset)
tag_dict = {'ct' + current_tag.name.capitalize(): current_tag.text for current_tag in tag_matches}
tag_dict = multipleFields(data, 'intervention_name', tag_dict)
tag_dict = multipleFields(data, 'intervention_type', tag_dict)
tag_dict = multipleFields(data, 'arm_group_type', tag_dict)
tag_dict['ctID'] = nctid
return removeEmptyKeys(tag_dict)
def multipleFields (data, subset, tagDict):
fields = data.find_all(subset)
field = [each.text for each in fields]
tagDict['ct' + subset.capitalize()] = ", ".join(field)
return tagDict
def removeEmptyKeys (dict1):
newDict = {k:v for (k, v) in dict1.items() if v}
return newDict
pprint.pprint(clinicalTrialsGov("NCT01220960"))
- I have used a dictionary comprehension to define
tag_dict
andnewDict
. This is similar to a list comprehension or a generator expression but specialized for dictionaries - I have removed the
try … except
frommultipleFields
because I don't see in which case an exception will be raised (especially since you didn't specify which one you were trying to catch) - I have presumed that
subset
inmultipleFields()
is a string and not a list of strings since you were looking only for one tag - I have used a list comprehension to define
field
inmultipleFields()
- I have used the
pprint
module to see better the answer.
New contributor
add a comment |
I hope I'm not too late.
There are a few things you could do :
Solution 1
import requests
from bs4 import BeautifulSoup
import pprint
def clinicalTrialsGov (nctid):
data = BeautifulSoup(requests.get("https://clinicaltrials.gov/ct2/show/" + nctid + "?displayxml=true").text, "xml")
subset = ['study_type', 'allocation', 'intervention_model', 'primary_purpose', 'masking', 'enrollment', 'official_title', 'condition', 'minimum_age', 'maximum_age', 'gender', 'healthy_volunteers', 'phase', 'primary_outcome', 'secondary_outcome', 'number_of_arms']
tag_matches = data.find_all(subset)
tag_dict = {'ct' + current_tag.name.capitalize(): current_tag.text for current_tag in tag_matches}
tag_dict = multipleFields(data, 'intervention_name', tag_dict)
tag_dict = multipleFields(data, 'intervention_type', tag_dict)
tag_dict = multipleFields(data, 'arm_group_type', tag_dict)
tag_dict['ctID'] = nctid
return removeEmptyKeys(tag_dict)
def multipleFields (data, subset, tagDict):
fields = data.find_all(subset)
field = [each.text for each in fields]
tagDict['ct' + subset.capitalize()] = ", ".join(field)
return tagDict
def removeEmptyKeys (dict1):
newDict = {k:v for (k, v) in dict1.items() if v}
return newDict
pprint.pprint(clinicalTrialsGov("NCT01220960"))
- I have used a dictionary comprehension to define
tag_dict
andnewDict
. This is similar to a list comprehension or a generator expression but specialized for dictionaries - I have removed the
try … except
frommultipleFields
because I don't see in which case an exception will be raised (especially since you didn't specify which one you were trying to catch) - I have presumed that
subset
inmultipleFields()
is a string and not a list of strings since you were looking only for one tag - I have used a list comprehension to define
field
inmultipleFields()
- I have used the
pprint
module to see better the answer.
New contributor
add a comment |
I hope I'm not too late.
There are a few things you could do :
Solution 1
import requests
from bs4 import BeautifulSoup
import pprint
def clinicalTrialsGov (nctid):
data = BeautifulSoup(requests.get("https://clinicaltrials.gov/ct2/show/" + nctid + "?displayxml=true").text, "xml")
subset = ['study_type', 'allocation', 'intervention_model', 'primary_purpose', 'masking', 'enrollment', 'official_title', 'condition', 'minimum_age', 'maximum_age', 'gender', 'healthy_volunteers', 'phase', 'primary_outcome', 'secondary_outcome', 'number_of_arms']
tag_matches = data.find_all(subset)
tag_dict = {'ct' + current_tag.name.capitalize(): current_tag.text for current_tag in tag_matches}
tag_dict = multipleFields(data, 'intervention_name', tag_dict)
tag_dict = multipleFields(data, 'intervention_type', tag_dict)
tag_dict = multipleFields(data, 'arm_group_type', tag_dict)
tag_dict['ctID'] = nctid
return removeEmptyKeys(tag_dict)
def multipleFields (data, subset, tagDict):
fields = data.find_all(subset)
field = [each.text for each in fields]
tagDict['ct' + subset.capitalize()] = ", ".join(field)
return tagDict
def removeEmptyKeys (dict1):
newDict = {k:v for (k, v) in dict1.items() if v}
return newDict
pprint.pprint(clinicalTrialsGov("NCT01220960"))
- I have used a dictionary comprehension to define
tag_dict
andnewDict
. This is similar to a list comprehension or a generator expression but specialized for dictionaries - I have removed the
try … except
frommultipleFields
because I don't see in which case an exception will be raised (especially since you didn't specify which one you were trying to catch) - I have presumed that
subset
inmultipleFields()
is a string and not a list of strings since you were looking only for one tag - I have used a list comprehension to define
field
inmultipleFields()
- I have used the
pprint
module to see better the answer.
New contributor
I hope I'm not too late.
There are a few things you could do :
Solution 1
import requests
from bs4 import BeautifulSoup
import pprint
def clinicalTrialsGov (nctid):
data = BeautifulSoup(requests.get("https://clinicaltrials.gov/ct2/show/" + nctid + "?displayxml=true").text, "xml")
subset = ['study_type', 'allocation', 'intervention_model', 'primary_purpose', 'masking', 'enrollment', 'official_title', 'condition', 'minimum_age', 'maximum_age', 'gender', 'healthy_volunteers', 'phase', 'primary_outcome', 'secondary_outcome', 'number_of_arms']
tag_matches = data.find_all(subset)
tag_dict = {'ct' + current_tag.name.capitalize(): current_tag.text for current_tag in tag_matches}
tag_dict = multipleFields(data, 'intervention_name', tag_dict)
tag_dict = multipleFields(data, 'intervention_type', tag_dict)
tag_dict = multipleFields(data, 'arm_group_type', tag_dict)
tag_dict['ctID'] = nctid
return removeEmptyKeys(tag_dict)
def multipleFields (data, subset, tagDict):
fields = data.find_all(subset)
field = [each.text for each in fields]
tagDict['ct' + subset.capitalize()] = ", ".join(field)
return tagDict
def removeEmptyKeys (dict1):
newDict = {k:v for (k, v) in dict1.items() if v}
return newDict
pprint.pprint(clinicalTrialsGov("NCT01220960"))
- I have used a dictionary comprehension to define
tag_dict
andnewDict
. This is similar to a list comprehension or a generator expression but specialized for dictionaries - I have removed the
try … except
frommultipleFields
because I don't see in which case an exception will be raised (especially since you didn't specify which one you were trying to catch) - I have presumed that
subset
inmultipleFields()
is a string and not a list of strings since you were looking only for one tag - I have used a list comprehension to define
field
inmultipleFields()
- I have used the
pprint
module to see better the answer.
New contributor
New contributor
answered 3 hours ago
EvensF
1011
1011
New contributor
New contributor
add a comment |
add a comment |
But we can go further:
Solution 2
import requests
from bs4 import BeautifulSoup
import pprint
def clinicalTrialsGov (nctid):
data = BeautifulSoup(requests.get("https://clinicaltrials.gov/ct2/show/" + nctid + "?displayxml=true").text, "xml")
subset = ['study_type', 'allocation', 'intervention_model',
'primary_purpose', 'masking', 'enrollment',
'official_title', 'condition', 'minimum_age',
'maximum_age', 'gender', 'healthy_volunteers',
'phase', 'primary_outcome', 'secondary_outcome',
'number_of_arms', 'intervention_name',
'intervention_type', 'arm_group_type']
tag_dict = {f'ct{subset_detail.capitalize()}' : [current_tag.text
for current_tag
in data.find_all(subset_detail)]
for subset_detail in subset}
result_data = {k: ", ".join(v) for (k, v) in tag_dict.items() if v}
result_data['ctID'] = nctid
return result_data
pprint.pprint(clinicalTrialsGov("NCT01220960"))
- Instead of looking for almost all tags at the same time, I use a dictionary comprehension to look for each tag seperately and creating a list containing the text retrieved linked to the tag. This works if there's zero, one or many matching tags.
- I created another dictionary (
result_data
) to merge the answers (if there's more than one) and filter out the tags that don't have text associated with them.
New contributor
add a comment |
But we can go further:
Solution 2
import requests
from bs4 import BeautifulSoup
import pprint
def clinicalTrialsGov (nctid):
data = BeautifulSoup(requests.get("https://clinicaltrials.gov/ct2/show/" + nctid + "?displayxml=true").text, "xml")
subset = ['study_type', 'allocation', 'intervention_model',
'primary_purpose', 'masking', 'enrollment',
'official_title', 'condition', 'minimum_age',
'maximum_age', 'gender', 'healthy_volunteers',
'phase', 'primary_outcome', 'secondary_outcome',
'number_of_arms', 'intervention_name',
'intervention_type', 'arm_group_type']
tag_dict = {f'ct{subset_detail.capitalize()}' : [current_tag.text
for current_tag
in data.find_all(subset_detail)]
for subset_detail in subset}
result_data = {k: ", ".join(v) for (k, v) in tag_dict.items() if v}
result_data['ctID'] = nctid
return result_data
pprint.pprint(clinicalTrialsGov("NCT01220960"))
- Instead of looking for almost all tags at the same time, I use a dictionary comprehension to look for each tag seperately and creating a list containing the text retrieved linked to the tag. This works if there's zero, one or many matching tags.
- I created another dictionary (
result_data
) to merge the answers (if there's more than one) and filter out the tags that don't have text associated with them.
New contributor
add a comment |
But we can go further:
Solution 2
import requests
from bs4 import BeautifulSoup
import pprint
def clinicalTrialsGov (nctid):
data = BeautifulSoup(requests.get("https://clinicaltrials.gov/ct2/show/" + nctid + "?displayxml=true").text, "xml")
subset = ['study_type', 'allocation', 'intervention_model',
'primary_purpose', 'masking', 'enrollment',
'official_title', 'condition', 'minimum_age',
'maximum_age', 'gender', 'healthy_volunteers',
'phase', 'primary_outcome', 'secondary_outcome',
'number_of_arms', 'intervention_name',
'intervention_type', 'arm_group_type']
tag_dict = {f'ct{subset_detail.capitalize()}' : [current_tag.text
for current_tag
in data.find_all(subset_detail)]
for subset_detail in subset}
result_data = {k: ", ".join(v) for (k, v) in tag_dict.items() if v}
result_data['ctID'] = nctid
return result_data
pprint.pprint(clinicalTrialsGov("NCT01220960"))
- Instead of looking for almost all tags at the same time, I use a dictionary comprehension to look for each tag seperately and creating a list containing the text retrieved linked to the tag. This works if there's zero, one or many matching tags.
- I created another dictionary (
result_data
) to merge the answers (if there's more than one) and filter out the tags that don't have text associated with them.
New contributor
But we can go further:
Solution 2
import requests
from bs4 import BeautifulSoup
import pprint
def clinicalTrialsGov (nctid):
data = BeautifulSoup(requests.get("https://clinicaltrials.gov/ct2/show/" + nctid + "?displayxml=true").text, "xml")
subset = ['study_type', 'allocation', 'intervention_model',
'primary_purpose', 'masking', 'enrollment',
'official_title', 'condition', 'minimum_age',
'maximum_age', 'gender', 'healthy_volunteers',
'phase', 'primary_outcome', 'secondary_outcome',
'number_of_arms', 'intervention_name',
'intervention_type', 'arm_group_type']
tag_dict = {f'ct{subset_detail.capitalize()}' : [current_tag.text
for current_tag
in data.find_all(subset_detail)]
for subset_detail in subset}
result_data = {k: ", ".join(v) for (k, v) in tag_dict.items() if v}
result_data['ctID'] = nctid
return result_data
pprint.pprint(clinicalTrialsGov("NCT01220960"))
- Instead of looking for almost all tags at the same time, I use a dictionary comprehension to look for each tag seperately and creating a list containing the text retrieved linked to the tag. This works if there's zero, one or many matching tags.
- I created another dictionary (
result_data
) to merge the answers (if there's more than one) and filter out the tags that don't have text associated with them.
New contributor
New contributor
answered 3 hours ago
EvensF
1011
1011
New contributor
New contributor
add a comment |
add a comment |
I have looked at the xml data coming in and I noticed that, for example, 'primary_outcome' and 'secondary_outcome' includes other tags ('measure', 'timeframe' and 'description'). Maybe you need all the information in tags but if you needed to retreive only the 'measure' for these tags you could do something like this:
Solution 3
import requests
from bs4 import BeautifulSoup
import pprint
def clinicalTrialsGov (nctid):
data = BeautifulSoup(requests.get("https://clinicaltrials.gov/ct2/show/" + nctid + "?displayxml=true").text, "xml")
subset = ['study_type', 'allocation', 'intervention_model',
'primary_purpose', 'masking', 'enrollment',
'official_title', 'condition', 'minimum_age',
'maximum_age', 'gender', 'healthy_volunteers',
'phase', 'number_of_arms', 'intervention_name',
'intervention_type', 'arm_group_type']
subset_has_measure = ['primary_outcome', 'secondary_outcome',]
tag_dict = {f'ct{subset_detail.capitalize()}' : [current_tag.text for current_tag in data.find_all(subset_detail)]
for subset_detail in subset}
tag_dict_with_measure = {f'ct{subset_detail.capitalize()}' : [current_tag.text
for current_tag
in data.select(f'{subset_detail} measure')]
for subset_detail in subset_has_measure}
result_data = {k: ", ".join(v) for (k, v) in tag_dict.items() if v}
result_data.update((k, ", ".join(v)) for (k, v) in tag_dict_with_measure.items() if v)
result_data['ctID'] = nctid
return result_data
pprint.pprint(clinicalTrialsGov("NCT01220960"))
- Instead of using
.find_all()
I use.select()
which enables us to use a CSS expression to the tag we want
You could in fact generalize this situation for something else than 'measure':
Solution 4
import requests
from bs4 import BeautifulSoup
import pprint
def clinicalTrialsGov (nctid):
data = BeautifulSoup(requests.get("https://clinicaltrials.gov/ct2/show/" + nctid + "?displayxml=true").text, "xml")
subset = { '': ['study_type', 'allocation', 'intervention_model',
'primary_purpose', 'masking', 'enrollment',
'official_title', 'condition', 'minimum_age',
'maximum_age', 'gender', 'healthy_volunteers', 'phase',
'number_of_arms', 'intervention_name', 'intervention_type',
'arm_group_type'],
'measure': ['primary_outcome', 'secondary_outcome',]
}
tag_dict = {f'ct{subset_detail.capitalize()}' : [current_tag.text
for current_tag
in data.select(f'{subset_detail} {subset_category}')]
for (subset_category, subset_types) in subset.items() for subset_detail in subset_types}
result_data = {k: ", ".join(v) for (k, v) in tag_dict.items() if v}
result_data['ctID'] = nctid
return result_data
pprint.pprint(clinicalTrialsGov("NCT01220960"))
New contributor
add a comment |
I have looked at the xml data coming in and I noticed that, for example, 'primary_outcome' and 'secondary_outcome' includes other tags ('measure', 'timeframe' and 'description'). Maybe you need all the information in tags but if you needed to retreive only the 'measure' for these tags you could do something like this:
Solution 3
import requests
from bs4 import BeautifulSoup
import pprint
def clinicalTrialsGov(nctid):
    """Fetch a ClinicalTrials.gov study record and return selected fields.

    Plain fields are taken as the tag's full text; outcome fields take
    only the text of the nested <measure> child.  Empty fields are
    dropped from the result.

    Parameters:
        nctid: the NCT identifier of the study (e.g. "NCT01220960").

    Returns:
        A dict of 'ct<FieldName>' keys to comma-joined text values,
        plus a 'ctID' entry echoing the requested id.

    Raises:
        requests.HTTPError: if the server responds with an error status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    response = requests.get(
        f"https://clinicaltrials.gov/ct2/show/{nctid}",
        params={"displayxml": "true"},  # let requests build/escape the query string
        timeout=30,  # without a timeout, a stalled connection hangs forever
    )
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    data = BeautifulSoup(response.text, "xml")

    subset = ['study_type', 'allocation', 'intervention_model',
              'primary_purpose', 'masking', 'enrollment',
              'official_title', 'condition', 'minimum_age',
              'maximum_age', 'gender', 'healthy_volunteers',
              'phase', 'number_of_arms', 'intervention_name',
              'intervention_type', 'arm_group_type']
    subset_has_measure = ['primary_outcome', 'secondary_outcome',]
    tag_dict = {f'ct{subset_detail.capitalize()}':
                    [current_tag.text for current_tag in data.find_all(subset_detail)]
                for subset_detail in subset}
    # For outcome tags, a CSS selector reaches only the nested <measure> text.
    tag_dict_with_measure = {f'ct{subset_detail.capitalize()}':
                                 [current_tag.text
                                  for current_tag
                                  in data.select(f'{subset_detail} measure')]
                             for subset_detail in subset_has_measure}
    # Join repeated tags into one comma-separated string; drop empty fields.
    result_data = {k: ", ".join(v) for (k, v) in tag_dict.items() if v}
    result_data.update((k, ", ".join(v)) for (k, v) in tag_dict_with_measure.items() if v)
    result_data['ctID'] = nctid
    return result_data


pprint.pprint(clinicalTrialsGov("NCT01220960"))
- Instead of using `.find_all()`, I use `.select()`, which lets us use a CSS selector expression to reach the tag we want.
You could in fact generalize this situation for something else than 'measure':
Solution 4
import requests
from bs4 import BeautifulSoup
import pprint
def clinicalTrialsGov(nctid):
    """Fetch a ClinicalTrials.gov study record and return selected fields.

    Parameters:
        nctid: the NCT identifier of the study (e.g. "NCT01220960").

    Returns:
        A dict mapping 'ct<FieldName>' keys to comma-joined text values,
        plus a 'ctID' entry echoing the requested id.  Fields with no
        matching tags in the XML are omitted.

    Raises:
        requests.HTTPError: if the server responds with an error status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    response = requests.get(
        f"https://clinicaltrials.gov/ct2/show/{nctid}",
        params={"displayxml": "true"},  # let requests build/escape the query string
        timeout=30,  # without a timeout, a stalled connection hangs forever
    )
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    data = BeautifulSoup(response.text, "xml")

    # Key '' means "take the tag's full text"; key 'measure' means
    # "take only the text of the nested <measure> child".
    subset = { '': ['study_type', 'allocation', 'intervention_model',
                    'primary_purpose', 'masking', 'enrollment',
                    'official_title', 'condition', 'minimum_age',
                    'maximum_age', 'gender', 'healthy_volunteers', 'phase',
                    'number_of_arms', 'intervention_name', 'intervention_type',
                    'arm_group_type'],
               'measure': ['primary_outcome', 'secondary_outcome',]
             }
    # .strip() keeps the CSS selector well-formed when the nested part is ''.
    tag_dict = {f'ct{subset_detail.capitalize()}':
                    [current_tag.text
                     for current_tag
                     in data.select(f'{subset_detail} {subset_category}'.strip())]
                for (subset_category, subset_types) in subset.items()
                for subset_detail in subset_types}
    # Join repeated tags into one comma-separated string; drop empty fields.
    result_data = {k: ", ".join(v) for (k, v) in tag_dict.items() if v}
    result_data['ctID'] = nctid
    return result_data


pprint.pprint(clinicalTrialsGov("NCT01220960"))
New contributor
add a comment |
I have looked at the XML data coming in and I noticed that, for example, 'primary_outcome' and 'secondary_outcome' include other tags ('measure', 'timeframe' and 'description'). Maybe you need all the information in those tags, but if you needed to retrieve only the 'measure' for these tags you could do something like this:
Solution 3
import requests
from bs4 import BeautifulSoup
import pprint
def clinicalTrialsGov(nctid):
    """Fetch a ClinicalTrials.gov study record and return selected fields.

    Plain fields are taken as the tag's full text; outcome fields take
    only the text of the nested <measure> child.  Empty fields are
    dropped from the result.

    Parameters:
        nctid: the NCT identifier of the study (e.g. "NCT01220960").

    Returns:
        A dict of 'ct<FieldName>' keys to comma-joined text values,
        plus a 'ctID' entry echoing the requested id.

    Raises:
        requests.HTTPError: if the server responds with an error status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    response = requests.get(
        f"https://clinicaltrials.gov/ct2/show/{nctid}",
        params={"displayxml": "true"},  # let requests build/escape the query string
        timeout=30,  # without a timeout, a stalled connection hangs forever
    )
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    data = BeautifulSoup(response.text, "xml")

    subset = ['study_type', 'allocation', 'intervention_model',
              'primary_purpose', 'masking', 'enrollment',
              'official_title', 'condition', 'minimum_age',
              'maximum_age', 'gender', 'healthy_volunteers',
              'phase', 'number_of_arms', 'intervention_name',
              'intervention_type', 'arm_group_type']
    subset_has_measure = ['primary_outcome', 'secondary_outcome',]
    tag_dict = {f'ct{subset_detail.capitalize()}':
                    [current_tag.text for current_tag in data.find_all(subset_detail)]
                for subset_detail in subset}
    # For outcome tags, a CSS selector reaches only the nested <measure> text.
    tag_dict_with_measure = {f'ct{subset_detail.capitalize()}':
                                 [current_tag.text
                                  for current_tag
                                  in data.select(f'{subset_detail} measure')]
                             for subset_detail in subset_has_measure}
    # Join repeated tags into one comma-separated string; drop empty fields.
    result_data = {k: ", ".join(v) for (k, v) in tag_dict.items() if v}
    result_data.update((k, ", ".join(v)) for (k, v) in tag_dict_with_measure.items() if v)
    result_data['ctID'] = nctid
    return result_data


pprint.pprint(clinicalTrialsGov("NCT01220960"))
- Instead of using `.find_all()`, I use `.select()`, which lets us use a CSS selector expression to reach the tag we want.
You could in fact generalize this situation for something else than 'measure':
Solution 4
import requests
from bs4 import BeautifulSoup
import pprint
def clinicalTrialsGov(nctid):
    """Fetch a ClinicalTrials.gov study record and return selected fields.

    Parameters:
        nctid: the NCT identifier of the study (e.g. "NCT01220960").

    Returns:
        A dict mapping 'ct<FieldName>' keys to comma-joined text values,
        plus a 'ctID' entry echoing the requested id.  Fields with no
        matching tags in the XML are omitted.

    Raises:
        requests.HTTPError: if the server responds with an error status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    response = requests.get(
        f"https://clinicaltrials.gov/ct2/show/{nctid}",
        params={"displayxml": "true"},  # let requests build/escape the query string
        timeout=30,  # without a timeout, a stalled connection hangs forever
    )
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    data = BeautifulSoup(response.text, "xml")

    # Key '' means "take the tag's full text"; key 'measure' means
    # "take only the text of the nested <measure> child".
    subset = { '': ['study_type', 'allocation', 'intervention_model',
                    'primary_purpose', 'masking', 'enrollment',
                    'official_title', 'condition', 'minimum_age',
                    'maximum_age', 'gender', 'healthy_volunteers', 'phase',
                    'number_of_arms', 'intervention_name', 'intervention_type',
                    'arm_group_type'],
               'measure': ['primary_outcome', 'secondary_outcome',]
             }
    # .strip() keeps the CSS selector well-formed when the nested part is ''.
    tag_dict = {f'ct{subset_detail.capitalize()}':
                    [current_tag.text
                     for current_tag
                     in data.select(f'{subset_detail} {subset_category}'.strip())]
                for (subset_category, subset_types) in subset.items()
                for subset_detail in subset_types}
    # Join repeated tags into one comma-separated string; drop empty fields.
    result_data = {k: ", ".join(v) for (k, v) in tag_dict.items() if v}
    result_data['ctID'] = nctid
    return result_data


pprint.pprint(clinicalTrialsGov("NCT01220960"))
New contributor
I have looked at the XML data coming in and I noticed that, for example, 'primary_outcome' and 'secondary_outcome' include other tags ('measure', 'timeframe' and 'description'). Maybe you need all the information in those tags, but if you needed to retrieve only the 'measure' for these tags you could do something like this:
Solution 3
import requests
from bs4 import BeautifulSoup
import pprint
def clinicalTrialsGov(nctid):
    """Fetch a ClinicalTrials.gov study record and return selected fields.

    Plain fields are taken as the tag's full text; outcome fields take
    only the text of the nested <measure> child.  Empty fields are
    dropped from the result.

    Parameters:
        nctid: the NCT identifier of the study (e.g. "NCT01220960").

    Returns:
        A dict of 'ct<FieldName>' keys to comma-joined text values,
        plus a 'ctID' entry echoing the requested id.

    Raises:
        requests.HTTPError: if the server responds with an error status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    response = requests.get(
        f"https://clinicaltrials.gov/ct2/show/{nctid}",
        params={"displayxml": "true"},  # let requests build/escape the query string
        timeout=30,  # without a timeout, a stalled connection hangs forever
    )
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    data = BeautifulSoup(response.text, "xml")

    subset = ['study_type', 'allocation', 'intervention_model',
              'primary_purpose', 'masking', 'enrollment',
              'official_title', 'condition', 'minimum_age',
              'maximum_age', 'gender', 'healthy_volunteers',
              'phase', 'number_of_arms', 'intervention_name',
              'intervention_type', 'arm_group_type']
    subset_has_measure = ['primary_outcome', 'secondary_outcome',]
    tag_dict = {f'ct{subset_detail.capitalize()}':
                    [current_tag.text for current_tag in data.find_all(subset_detail)]
                for subset_detail in subset}
    # For outcome tags, a CSS selector reaches only the nested <measure> text.
    tag_dict_with_measure = {f'ct{subset_detail.capitalize()}':
                                 [current_tag.text
                                  for current_tag
                                  in data.select(f'{subset_detail} measure')]
                             for subset_detail in subset_has_measure}
    # Join repeated tags into one comma-separated string; drop empty fields.
    result_data = {k: ", ".join(v) for (k, v) in tag_dict.items() if v}
    result_data.update((k, ", ".join(v)) for (k, v) in tag_dict_with_measure.items() if v)
    result_data['ctID'] = nctid
    return result_data


pprint.pprint(clinicalTrialsGov("NCT01220960"))
- Instead of using `.find_all()`, I use `.select()`, which lets us use a CSS selector expression to reach the tag we want.
You could in fact generalize this situation for something else than 'measure':
Solution 4
import requests
from bs4 import BeautifulSoup
import pprint
def clinicalTrialsGov(nctid):
    """Fetch a ClinicalTrials.gov study record and return selected fields.

    Parameters:
        nctid: the NCT identifier of the study (e.g. "NCT01220960").

    Returns:
        A dict mapping 'ct<FieldName>' keys to comma-joined text values,
        plus a 'ctID' entry echoing the requested id.  Fields with no
        matching tags in the XML are omitted.

    Raises:
        requests.HTTPError: if the server responds with an error status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    response = requests.get(
        f"https://clinicaltrials.gov/ct2/show/{nctid}",
        params={"displayxml": "true"},  # let requests build/escape the query string
        timeout=30,  # without a timeout, a stalled connection hangs forever
    )
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    data = BeautifulSoup(response.text, "xml")

    # Key '' means "take the tag's full text"; key 'measure' means
    # "take only the text of the nested <measure> child".
    subset = { '': ['study_type', 'allocation', 'intervention_model',
                    'primary_purpose', 'masking', 'enrollment',
                    'official_title', 'condition', 'minimum_age',
                    'maximum_age', 'gender', 'healthy_volunteers', 'phase',
                    'number_of_arms', 'intervention_name', 'intervention_type',
                    'arm_group_type'],
               'measure': ['primary_outcome', 'secondary_outcome',]
             }
    # .strip() keeps the CSS selector well-formed when the nested part is ''.
    tag_dict = {f'ct{subset_detail.capitalize()}':
                    [current_tag.text
                     for current_tag
                     in data.select(f'{subset_detail} {subset_category}'.strip())]
                for (subset_category, subset_types) in subset.items()
                for subset_detail in subset_types}
    # Join repeated tags into one comma-separated string; drop empty fields.
    result_data = {k: ", ".join(v) for (k, v) in tag_dict.items() if v}
    result_data['ctID'] = nctid
    return result_data


pprint.pprint(clinicalTrialsGov("NCT01220960"))
New contributor
New contributor
answered 2 hours ago
EvensF
1011
1011
New contributor
New contributor
add a comment |
add a comment |
Thanks for contributing an answer to Code Review Stack Exchange!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
Use MathJax to format equations. MathJax reference.
To learn more, see our tips on writing great answers.
Some of your past answers have not been well-received, and you're in danger of being blocked from answering.
Please pay close attention to the following guidance:
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f194465%2fscraping-clinical-studies-database-via-clinicaltrials-gov%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
How many NCTIDs are you planning on scraping? If you need more than one you should look into
requests.Session
and possiblyscrapy
.– Graipher
May 15 at 16:15
@Graipher I plan on scraping around 100,000 NCTIDs.
– jdoe
May 15 at 18:56
1
FYI the data on ClinicalTrials.gov is also available in a publicly available PostgreSQL database.
– Daniel McCracken
May 17 at 15:11