# Lehelt lugemise koodi näide (example code for reading data from a web page)
# Allikas: Lambda (source: Lambda)
#!/usr/local/bin/python
import sys
import os
import libxml2
import libxslt
from types import *
# Absolute file paths used by the script; adjust them to the local installation.
startPage = "/home/tanel/Ms/Semriik/index.html"
extractorPath = "/home/tanel/Ms/Semriik/extractrdfa.xsl"
outfile = "/home/tanel/Ms/Semriik/data.xml"

# Any true value switches on verbose progress printing.
debugFlag = 0
def _extractTriples(style, url):
    """Apply the extractor stylesheet to the HTML page at ``url``.

    Returns the list of triple dicts produced by ``parseTriplesXml``; an
    empty list when the transformation yields nothing.
    """
    filedoc = libxml2.htmlParseFile(url, None)
    # The path is wrapped in single quotes so that the stylesheet receives
    # it as a string value for its 'filepath' parameter.
    appres = style.applyStylesheet(filedoc, {'filepath': "'" + url + "'"})
    triples = style.saveResultToString(appres)
    # libxml2 documents have to be released explicitly once they are no
    # longer needed; the serialised string is all that is used from here on.
    if appres is not None:
        appres.freeDoc()
    filedoc.freeDoc()
    # parseTriplesXml returns None for an empty result; normalise to a list
    # so that callers can concatenate and iterate without special cases.
    return parseTriplesXml(triples) or []


def main():
    """Collect triples from the start page and its info pages into ``outfile``.

    The extractor stylesheet (``extractorPath``) is applied to ``startPage``.
    For every distinct subject found there (the first triple is skipped),
    the URLs given by its 'er:infoleht' triples are fetched and transformed
    in turn; 'er:infoleht' URLs found on those pages are queued as well.
    Each URL is processed at most once.

    The triples of every fetched page are extended with derived triples:
    'er:asutus' (the organisation) for every subject on the page,
    'er:osakond' when the page declares one for its own URL, and
    'er:tyyp' = 'tootaja' for subjects that have an 'er:amet' or
    'er:ametijuhend' triple.  The combined result is serialised with
    ``tripletsToXml`` and written to ``outfile``.
    """
    # -- read and parse the extractor stylesheet --
    styleDoc = libxml2.parseFile(extractorPath)
    style = libxslt.parseStylesheetDoc(styleDoc)

    # -- read start page triples --
    confdata = _extractTriples(style, startPage)
    fulldata = list(confdata)
    if debugFlag:
        print("==== start page gave triplets ====\n")
        showTriplets(confdata)

    # -- loop over configuration data --
    handledorg = []
    handledurl = []
    # NOTE(review): the first start-page triple is deliberately skipped here,
    # as in the original code -- presumably it describes the page itself.
    for triple in confdata[1:]:
        organisation = triple.get('subject')
        if organisation in handledorg:
            continue
        if debugFlag:
            print("\n==== handling org  %s  ====\n" % (organisation,))
        handledorg.append(organisation)

        # Work queue of pages for this organisation; pages discovered while
        # processing are appended to its end.
        urllist = tripleSubjectValueList(confdata, organisation, 'er:infoleht')
        if debugFlag:
            print("found urls for org:  %s" % (urllist,))
        while urllist:
            url = urllist.pop(0)
            if url in handledurl:
                continue
            if debugFlag:
                print("==== starting to process url ====")
                print(url)
            handledurl.append(url)

            filedata = _extractTriples(style, url)
            if debugFlag:
                print("--- url gave initial triplets ---")
                showTriplets(filedata)
            internalurllist = tripleValueList(filedata, 'er:infoleht')
            if debugFlag:
                print("internal infoleht list:  %s" % (internalurllist,))

            # -- derive extra triples for the subjects of this page --
            subjects = tripleGetSubjects(filedata)
            if debugFlag:
                print("internal subjects list:  %s" % (subjects,))
            tmporg = tripleMakeForSubjects(subjects, 'er:asutus', organisation)
            department = tripleSubjectFirstValue(filedata, url, 'er:osakond')
            if department:
                tmpdepartment = tripleMakeForSubjects(
                    subjects, 'er:osakond', department)
            else:
                tmpdepartment = []
            tmptype = tripleMakePropForSubjectsDomain(
                filedata, subjects, 'er:tyyp', 'tootaja',
                ('er:amet', 'er:ametijuhend',))
            if debugFlag:
                print("--- derived triplets for url ---")
                showTriplets(tmporg)
                showTriplets(tmpdepartment)
                showTriplets(tmptype)

            filedata = filedata + tmporg + tmpdepartment + tmptype
            if debugFlag:
                print("--- final triplets for url ---")
                showTriplets(filedata)
            fulldata.extend(filedata)
            urllist.extend(internalurllist)

    style.freeStylesheet()

    if debugFlag:
        print("\n==== final full data ====\n")
        showTriplets(fulldata)
    xmlstr = tripletsToXml(fulldata)
    if debugFlag:
        print("\n==== final full data as xml string ====\n")
        print(xmlstr)

    # Close the output file even if the write fails, so the data is flushed
    # and the handle is not left to the garbage collector.
    handler = open(outfile, "w")
    try:
        handler.write(xmlstr)
    finally:
        handler.close()
def tripleSubjectWithPredValueSet(triples, predicate, object):
    """Return the distinct subjects that have the given predicate/object pair.

    ``triples`` is a sequence of triple dicts with 'subject', 'predicate'
    and 'object' keys; entries that are not dicts are ignored.  Subjects
    are returned in order of first appearance, each one only once.
    """
    res = []
    for el in triples:
        if not isinstance(el, dict):
            continue
        if el.get('object') == object and el.get('predicate') == predicate:
            subject = el.get('subject')
            # A list membership test keeps first-seen order and does not
            # require the subjects to be hashable.
            if subject not in res:
                res.append(subject)
    return res
def tripleSubjectHasValue(triples, subject, predicate, object):
    """Return True if ``triples`` contains exactly this triple, else False.

    Entries of ``triples`` that are not dicts are ignored.
    """
    for el in triples:
        if not isinstance(el, dict):
            continue
        if (el.get('subject') == subject
                and el.get('predicate') == predicate
                and el.get('object') == object):
            return True
    return False
def tripleSubjectValueList(triples, subject, predicate):
    """Return the objects of all triples with the given subject and predicate.

    Values are returned in input order, duplicates included; entries of
    ``triples`` that are not dicts are ignored.  The result is always a
    new list.
    """
    return [el.get('object')
            for el in triples
            if isinstance(el, dict)
            and el.get('subject') == subject
            and el.get('predicate') == predicate]
def tripleSubjectFirstValue(triples, subject, predicate):
    """Return the object of the first triple matching subject and predicate.

    Returns the empty string when no triple matches.  Entries of
    ``triples`` that are not dicts are ignored.
    """
    for el in triples:
        if not isinstance(el, dict):
            continue
        if el.get('subject') == subject and el.get('predicate') == predicate:
            return el.get('object')
    return ""
def tripleValueList(triples, predicate):
    """Return the objects of all triples with the given predicate.

    Values are returned in input order, duplicates included, regardless of
    subject; entries of ``triples`` that are not dicts are ignored.
    """
    return [el.get('object')
            for el in triples
            if isinstance(el, dict) and el.get('predicate') == predicate]
def tripleGetSubjects(triples):
    """Return the distinct subjects of ``triples`` in order of first appearance.

    Entries that are not dicts are ignored.  A dict without a 'subject' key
    contributes ``None`` (once).
    """
    res = []
    for el in triples:
        if not isinstance(el, dict):
            continue
        subject = el.get('subject')
        if subject not in res:
            res.append(subject)
    return res
def tripleMakeForSubjects(subjects, predicate, object):
    """Build one triple dict per subject, all sharing predicate and object.

    Returns a new list of ``{'subject', 'predicate', 'object'}`` dicts in
    the order of ``subjects``.
    """
    return [{'subject': el, 'predicate': predicate, 'object': object}
            for el in subjects]
# the following code adds derived data for certain kinds of subjects
def tripleMakePropForSubjectsDomain(filedata, subjects, newpred, newvalue, oldpreds):
    """Derive a ``newpred``/``newvalue`` triple for subjects using ``oldpreds``.

    For every triple dict in ``filedata`` whose predicate is one of
    ``oldpreds``, a triple ``(subject, newpred, newvalue)`` is emitted.  One
    derived triple is produced per matching input triple, so a subject with
    several matching predicates appears several times.  Entries of
    ``filedata`` that are not dicts are ignored.

    ``subjects`` is accepted for call compatibility but is not consulted.
    """
    return [{'subject': el.get('subject'),
             'predicate': newpred,
             'object': newvalue}
            for el in filedata
            if isinstance(el, dict) and el.get('predicate') in oldpreds]
def showTriplets(triplets):
    """Print every triple dict in ``triplets`` on its own line.

    Each line holds the 'subject', 'predicate', 'object' and 'type' values
    separated by single spaces; missing keys print as ``None``.  Entries
    that are not dicts are skipped.
    """
    for el in triplets:
        if isinstance(el, dict):
            # A single pre-formatted argument gives the same output whether
            # print is a statement (Python 2) or a function (Python 3).
            print("%s %s %s %s" % (el.get('subject'), el.get('predicate'),
                                   el.get('object'), el.get('type')))
def tripletsToXml(triplets):
    """Serialise triple dicts into a ``<triplets>`` XML string.

    Every dict in ``triplets`` becomes one ``<triplet>`` element with
    ``<subject>``, ``<predicate>``, ``<object>`` and ``<type>`` children
    whose text is escaped with ``xmlStrEnc``.  A missing or empty 'type'
    defaults to 'xsd:string'.  Entries that are not dicts are skipped.
    """
    template = ("\n<triplet>\n"
                "<subject>%s</subject><predicate>%s</predicate>"
                "<object>%s</object><type>%s</type>\n"
                "</triplet>")
    # Collect the pieces and join once instead of re-copying the growing
    # result string for every triple.
    parts = ["<triplets>"]
    for el in triplets:
        if not isinstance(el, dict):
            continue
        typeval = el.get('type')
        if not typeval:
            typeval = 'xsd:string'
        parts.append(template % (xmlStrEnc(el.get('subject')),
                                 xmlStrEnc(el.get('predicate')),
                                 xmlStrEnc(el.get('object')),
                                 xmlStrEnc(typeval),))
    parts.append("\n</triplets>")
    return "".join(parts)
def removeElem(el, list):
    """Return ``list`` without any element equal to ``el``.

    When ``el`` does not occur, the original list object is returned
    unchanged; otherwise a new list is built and the input is left intact.
    """
    # NOTE: the parameter name shadows the builtin ``list``; it is kept
    # because it is part of the function's signature.
    if el not in list:
        return list
    return [x for x in list if x != el]
def parseTriplesXml(xmlstr):
    """Parse the extractor's XML output into a list of triple dicts.

    Every element child of the document's first top-level node becomes one
    dict; each of its element children that starts with a text node adds an
    entry mapping the element name to its content (passed through
    ``xmlStrDec``).  Elements that yield no entries are dropped.

    Returns ``None`` when ``xmlstr`` is empty or the first top-level node
    has no children, otherwise the (possibly empty) list of dicts.
    """
    if not xmlstr:
        return None
    dom = libxml2.parseDoc(xmlstr)
    try:
        root = dom.children
        if not root or not root.children:
            return None
        res = []
        triple = root.children
        while triple:
            if triple.type == 'element':
                resel = {}
                el = triple.children
                while el:
                    if (el.type == 'element'
                            and el.children
                            and el.children.type == 'text'):
                        resel[el.name] = xmlStrDec(el.content)
                    el = el.next
                if resel:
                    res.append(resel)
            triple = triple.next
        return res
    finally:
        # libxml2 documents must be released explicitly; only plain Python
        # strings have been copied out of the tree at this point.
        dom.freeDoc()
def xmlStrDec(data):
    """Replace the five predefined XML entities in ``data`` by their characters.

    Handles ``&lt;``, ``&gt;``, ``&apos;``, ``&quot;`` and ``&amp;``.
    Returns the empty string for ``None`` or empty input.
    """
    if not data:
        return ""
    # '&amp;' is decoded last: otherwise an escaped ampersand followed by
    # 'lt;' (i.e. '&amp;lt;') would first become '&lt;' and then wrongly '<'.
    for entity, char in (('&lt;', '<'),
                         ('&gt;', '>'),
                         ('&apos;', "'"),
                         ('&quot;', '"'),
                         ('&amp;', '&')):
        data = data.replace(entity, char)
    return data
def xmlStrEnc(data):
    """Escape ``&``, ``<`` and ``>`` in ``data`` for use as XML text content.

    Returns the empty string for ``None`` or empty input, mirroring
    ``xmlStrDec``.
    """
    if not data:
        return ""
    # '&' must be escaped first so that the ampersands introduced by the
    # other two replacements are not escaped a second time.
    data = data.replace('&', '&amp;')
    data = data.replace('<', '&lt;')
    data = data.replace('>', '&gt;')
    return data
# Script entry point: the crawl runs as soon as this file is executed (or imported).
main()