"""Print the target molecule IDs of PubChem bioassays matching a search term.

The script sends ``query`` to the Entrez ``esearch.fcgi`` endpoint against the
``pcassay`` database, downloads the gzipped XML description of every assay ID
that comes back, and prints the list of ``PC-AssayTargetInfo_mol-id`` values
(``None`` for an assay whose description has no such element).

All of this runs at import time, exactly like the original flat script.
"""
import gzip
import re
import xml.etree.ElementTree as ET
from contextlib import closing
from io import BytesIO
from xml.etree.ElementTree import XML

try:  # Python 2 module layout
    import urllib2
    import urllib
    from StringIO import StringIO
    from urllib import urlencode
    from urllib2 import Request, urlopen
except ImportError:  # Python 3 module layout
    import urllib.parse
    import urllib.request
    from io import StringIO
    from urllib.parse import urlencode
    from urllib.request import Request, urlopen

query = "HIV"

# NOTE(review): both endpoints are reproduced verbatim from the original
# script -- confirm they are still reachable before relying on the output.
searchUrl = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
fetchUrl = 'ftp://ftp-private.ncbi.nlm.nih.gov/pubchem/Bioassay/CSV/Description/%s.descr.xml.gz'

params = urlencode({'db': 'pcassay', 'retmode': 'xml', 'term': query})

# Supplying a request body makes this a POST.  Python 3 requires the body to
# be bytes; on Python 2 encoding the ASCII-only urlencoded string is a no-op.
req = Request(searchUrl, params.encode('ascii'))
with closing(urlopen(req)) as search_resp:
    root = XML(search_resp.read())

## get the assay ID's
# No result-count parameter is sent, so the number of IDs is whatever the
# service returns by default.
ids = [x.text for x in root.findall('IdList/Id')]


def addns(path, ns):
    """Qualify every step of a '/'-separated element path with a namespace.

    Each tag in ``path`` is prefixed with ``{ns}``, the notation ElementTree
    uses for namespaced tags, e.g. ``addns('a/b', 'u')`` gives
    ``'{u}a/{u}b'``.

    The namespace is inserted literally.  The previous ``re.sub`` version used
    it as a replacement template, so a backslash in ``ns`` would have been
    interpreted as an escape sequence.
    """
    prefix = '{%s}' % (ns)
    return prefix + path.replace('/', '/' + prefix)


# Element path (relative to the document root) of the assay target's molecule
# ID, qualified with the namespace the original script used.
_NCBI_NS = 'http://www.ncbi.nlm.nih.gov'
_TARGET_MOL_ID_PATH = addns(
    'PC-AssaySubmit/PC-AssaySubmit_assay/PC-AssaySubmit_assay_descr/PC-AssayDescription/PC-AssayDescription_target/PC-AssayTargetInfo/PC-AssayTargetInfo_mol-id',
    _NCBI_NS)


## get the assay documents
def getProteinID(aid):
    """Return the target molecule ID recorded in one assay's description.

    Downloads ``fetchUrl % aid``, gunzips it in memory, parses the XML and
    looks up the first ``PC-AssayTargetInfo_mol-id`` element.

    :param aid: assay ID as substituted into ``fetchUrl``.
    :returns: the element's text, or ``None`` when the description contains
        no such element.
    :raises: whatever ``urlopen``, ``gzip`` or the XML parser raise on a
        failed download or malformed payload; nothing is caught here.
    """
    # Close the connection as soon as the payload is in memory so a long ID
    # list does not accumulate open handles.
    with closing(urlopen(Request(fetchUrl % (aid)))) as resp:
        compressed = resp.read()

    with closing(gzip.GzipFile(fileobj=BytesIO(compressed), mode='rb')) as gz:
        doc = XML(gz.read())

    pid = doc.find(_TARGET_MOL_ID_PATH)
    if pid is None:
        return None
    return pid.text


## get the protein ID's
pids = [getProteinID(x) for x in ids]

# A single parenthesised argument prints identically under the Python 2
# print statement and the Python 3 print function.
print(pids)