User:Alex brollo/OpalToIA
This is my draft (but happily running) OpalLib.py script under the itsource project. It grabs PDF files from Opal libri antichi (double-page scans), crops and splits the derived TIFFs, packs them into a zip file named [IA id]_images.zip and uploads them to the Internet Archive, together with a simple set of metadata coming from Opal.
Presently it is run from the itsource shell by a command:
python OpalLib.py [opal_id] [IA S3 access key] [IA S3 secret key]
or by a jsub command:
jsub -sync y python OpalLib.py [opal_id] [IA S3 access key] [IA S3 secret key]
or by
./test.sh [opal_id] [IA S3 access key] [IA S3 secret key]
or simply, presently,
./opalnew.sh [IA S3 access key] [IA S3 secret key]
opalnew.sh being merely a list of individual jsub .... python....
The internetarchive module is installed locally, and the script runs inside a virtualenv (itsource has recently been migrated to eqiad, and I got the valuable suggestion to use virtualenv so that the needed modules can be installed locally with pip).
#!/usr/bin/python
# -*- coding: utf-8 -*-
import anydbm
import math
import os
import pickle
import shutil
import sys
from urllib import FancyURLopener

import Image
import ImageChops
import internetarchive


class MyOpener(FancyURLopener):
    """URL opener that identifies the script with a custom user-agent string."""

    # NOTE(review): FancyURLopener sends `version` as the User-Agent value, so
    # the "User-Agent: " prefix presumably ends up inside the header value.
    # Left unchanged on purpose -- confirm before editing.
    version = 'User-Agent: Alex (+http://it.wikisource.org/wiki/Utente:Alex_brollo)'


opener = MyOpener()

# Base URL of the PDF list for each supported Opal collection.
_OPAL_BASE_URLS = {
    "teatro": "http://www.opal.unito.it/psixsite/Teatro%20italiano%20del%20XVI%20e%20XVII%20secolo/Elenco%20opere/",
    "narrativa": "http://www.opal.unito.it/psixsite/Narrativa%20italiana%20del%20Seicento%20(e%20dintorni)/Elenco%20opere/",
    "miscellanea": "http://www.opal.unito.it/psixsite/Miscellanea%20di%20testi%20di%20genere%20diverso/Elenco%20opere/",
}


def iaMetadata(id):
    """Build the Internet Archive metadata dict for one Opal record.

    The record is read from the dbm file public_html/opal.db under the key
    id + ".pdf"; its value is a UTF-8, tab-separated row.  Fields used:
    0 = author, 1 = title, 5 = city, 6 = printer, 7 = year (field 2 is only
    copied into the description).

    Raises KeyError if the record is missing and IndexError if the row has
    fewer than eight fields.
    """
    mtd = anydbm.open("public_html/opal.db")
    try:
        # Close the database even when the key is missing.
        meta = unicode(mtd[id + ".pdf"], "utf-8").split("\t")
    finally:
        mtd.close()

    author = nomeCognome(meta[0])
    credits = '<br /><div>Scanned by Claudio Ruggeri, <a href="http://www.opal.unito.it" rel="nofollow">Opal Libri antichi</a>, University of Turin</div>'
    summary = " ".join([meta[0] + ".", meta[1] + ".", meta[2] + ".",
                        meta[5] + ":", meta[6] + ",", meta[7]])

    myMetadata = {}
    myMetadata['mediatype'] = 'texts'
    myMetadata['language'] = 'ita'
    myMetadata['licenseurl'] = 'http://creativecommons.org/publicdomain/zero/1.0/'
    myMetadata['collection'] = 'opensource_media'
    myMetadata['subject'] = 'Italian theater;16th century'
    myMetadata['creator'] = author
    myMetadata['title'] = meta[1] + " (" + author + ")"
    myMetadata['year'] = meta[7]
    myMetadata['printer'] = meta[6]
    myMetadata['city'] = meta[5]
    # Bibliographic summary first, scanning credits after it.
    myMetadata['description'] = summary + credits
    return myMetadata


def nomeCognome(autore):
    """Turn "Surname, Name" into "Name Surname"; other strings are returned as is."""
    if "," not in autore:
        return autore
    parts = autore.split(",")
    return parts[1].strip() + " " + parts[0].strip()


def grabOpal(id):
    """Download one Opal PDF and run the whole extract/split/zip/upload chain.

    id is "collection/pdf name without extension" (e.g. teatro/image10);
    the collection must be one of teatro, narrativa, miscellanea.  On a
    malformed id or unknown collection "Errore parametri" is printed and
    nothing else happens.
    """
    parts = id.split("/")
    if len(parts) < 2:
        # No "/" in the id: report it instead of failing with IndexError.
        print("Errore parametri")
        return
    raccolta = parts[0]
    idOpal = parts[1]

    base = _OPAL_BASE_URLS.get(raccolta)
    if base is None:
        print("Errore parametri")
        return

    url = base + idOpal + ".pdf"
    # Internet Archive identifier, e.g. teatro/image10 -> image10TeatroOpal
    idIA = idOpal + raccolta[0:1].upper() + raccolta[1:] + "Opal"
    print("Grabbing...")
    grab(url, idIA + ".pdf")  # grabbing pdf file from Opal and saving it as [idIA].pdf
    print("Launching opal....")
    opal(idIA, idOpal, raccolta, tipo="tiff")  # launching the main routine to extract, split, zip and upload
    return


def opal(id, idOpal, raccolta, tipo="tiff", taglio=True):
    """Rasterise [id].pdf, optionally split the double pages, zip and upload.

    id      -- Internet Archive identifier (also the local pdf base name)
    idOpal  -- pdf name on Opal, forwarded to iaUpload
    raccolta-- Opal collection name, forwarded to iaUpload
    tipo    -- "tiff" selects the .tif extension, anything else .jpg
               (NOTE(review): separa's gs command only writes .tif files --
               confirm other values are really supported)
    taglio  -- when true, every scan is split into left and right pages
    """
    separa(id + ".pdf", tipo)
    print("Inizio splitting")
    tipoFile = "tif" if tipo == "tiff" else "jpg"
    if taglio:
        tT(tipoFile)  # splitting routine
        # deleting first empty page since frontespice must be the first image;
        # the file only exists when the splitting step has run
        os.remove("public_html/out2/pag-0000." + tipoFile)
    zippa(id, taglio)  # zipping and uploading
    print("Inizio caricamento su Opal")
    iaUpload(id, idOpal, raccolta)
    print("Fatto")
    return


def grab(url, output=None):
    """Download url (with "?action=render" appended) into a local file.

    output defaults to the last path component of url.  The file name is
    printed and the string "Fatto" is returned.
    """
    page = opener.open(url + "?action=render")
    try:
        content = page.read()
    finally:
        page.close()
    if output is None:
        output = url[url.rfind("/") + 1:]
    # Context manager so the data is flushed and the handle released.
    with open(output, "wb") as target:
        target.write(content)
    print(output)
    return "Fatto"


def carica_pcl(nome_file, folder=""):
    """Load and return the object pickled in folder + nome_file + ".pcl".

    NOTE: pickle can execute arbitrary code; only load files this script wrote.
    """
    nome_file = folder + nome_file + ".pcl"
    with open(nome_file) as f:
        contenuto = pickle.load(f)
    return contenuto


def salva_pcl(variabile, nome_file="dato", folder=""):
    """Pickle variabile into folder + nome_file + ".pcl" and print the file name."""
    nome_file = folder + nome_file + ".pcl"
    with open(nome_file, "w") as f:
        pickle.dump(variabile, f)
    print("Variabile salvata nel file " + nome_file)
    return


##
# Crop borders off an image.
#
# @param im Source image.
# @param bgcolor Background color, using either a color tuple or
# a color name (1.1.4 only).
# @return An image without borders, or None if there's no actual
# content in the image.
def autocrop(im, bgcolor):
    """Return im cropped to the area that differs from bgcolor, or None if there is none."""
    if im.mode != "RGB":
        im = im.convert("RGB")
    bg = Image.new("RGB", im.size, bgcolor)
    diff = ImageChops.difference(im, bg)
    bbox = diff.getbbox()
    if bbox:
        return im.crop(bbox)
    return None  # no contents


def _empty_folder(folder):
    """Delete every entry found directly inside folder."""
    for name in os.listdir(folder):
        os.remove(folder + "/" + name)


def separa(filepdf, tipo):
    """Empty public_html/out and public_html/out2, then rasterise filepdf.

    Ghostscript writes one 300 dpi LZW TIFF per pdf page as
    public_html/out/pagNNNN.tif.  tipo is accepted but not used here.
    """
    _empty_folder("public_html/out")
    _empty_folder("public_html/out2")
    # NOTE(review): filepdf is pasted unquoted into a shell command line.
    scriptBase = 'gs -sDEVICE=tiff24nc -r300x300 -sCompression=lzw -dNOPAUSE -dBATCH -sOutputFile="public_html/out/pag%04d.tif" ' + filepdf
    print(scriptBase)
    os.system(scriptBase)
    return


def zippa(IaId, taglio=True):
    """Zip the page images into public_html/[IaId]_images.zip.

    The split pages (public_html/out2) are used when taglio is true,
    otherwise the unsplit scans (public_html/out).
    """
    IaId = IaId.replace(".pdf", "")
    if taglio:
        os.system("zip -r public_html/" + IaId + "_images.zip public_html/out2")
    else:
        os.system("zip -r public_html/" + IaId + "_images.zip public_html/out")
    return


def calcola(x1, y1, m0, m1, m2, m3):
    """Print the four margins, shrunk by a tenth of (x1-m0-m2), as percentages.

    m0 and m2 are expressed relative to x1, m1 and m3 relative to y1.
    """
    delta = (x1 - m0 - m2) * 0.1
    m0 = m0 - delta
    m1 = m1 - delta
    m2 = m2 - delta
    m3 = m3 - delta
    print(m0 / x1 * 100)
    print(m1 / y1 * 100)
    print(m2 / x1 * 100)
    print(m3 / y1 * 100)
    return


def tT(tipoFile="tif"):
    """Split every double-page scan of public_html/out into public_html/out2.

    Scans pag0001, pag0002, ... are read until the first missing number.
    Scan number k (counting from 0) yields pag-[2k] (left half) and
    pag-[2k+1] (right half) after the white border has been cropped off.
    """
    # splits images of /out into /out2
    lista = []
    n = 1
    while os.path.isfile("public_html/out/pag" + str(n).zfill(4) + "." + tipoFile):
        lista.append("public_html/out/pag" + str(n).zfill(4) + "." + tipoFile)
        n += 1

    for i, path in enumerate(lista):
        scan = Image.open(path)
        cropped = autocrop(scan, (255, 255, 255))
        if cropped is None:
            # Completely white scan: autocrop found nothing to keep.  Split the
            # uncropped image so page numbering stays aligned instead of
            # failing on None.
            cropped = scan
        width, height = cropped.size
        middle = int(width * 0.5)
        # creating the left page
        left = cropped.crop((0, 0, middle, height))
        left.save("public_html/out2/pag-" + f0(i * 2) + "." + tipoFile)
        # creating the right page
        right = cropped.crop((middle, 0, width, height))
        right.save("public_html/out2/pag-" + f0(i * 2 + 1) + "." + tipoFile)
    return


def f0(n, w=4):
    """Return n as a decimal string left-padded with zeros to at least w characters.

    Numbers with more than w digits are returned in full rather than cut, so
    two different page numbers never map to the same file name.
    """
    return str(n).zfill(w)


def iaUpload(iaId, idOpal, raccolta, test=False):
    """Upload the zipped images and the source pdf to Internet Archive item iaId.

    Metadata is looked up with the key raccolta + ":" + idOpal.  Nothing is
    uploaded when test is true or when the item already exists.  On upload
    the pdf is moved into sourceImages.zip, which is uploaded as well.
    """
    # Documentation: https://pypi.python.org/pypi/internetarchive
    # S3 keys: https://archive.org/account/s3.php
    metadati = iaMetadata(raccolta + ":" + idOpal)
    item = internetarchive.Item(iaId)
    # The doubled space reproduces the output of the former
    # comma-separated print statements.
    print("idOpal:  %s" % (idOpal,))
    print("File zip:  " + "public_html/" + iaId + "_images.zip")
    print(iaId)

    if test:
        print("Item " + iaId + " not uploaded (run test)")
        return
    if item.exists:
        print("Item " + iaId + " already exists")
        return

    item.upload("public_html/" + iaId + "_images.zip", metadata=metadati)
    # Empty the shared archive, then move the pdf of this item into it.
    os.system("zip -d sourceImages *")
    os.system("zip -m sourceImages " + iaId + ".pdf")
    item.upload("sourceImages.zip")
    return


def main():
    """Command line entry point: OpalLib.py [opal_id] [IA S3 access key] [IA S3 secret key]."""
    if len(sys.argv) >= 4:
        idOpal = sys.argv[1]
        # The S3 keys are passed on through the environment.
        os.environ['AWS_ACCESS_KEY_ID'] = sys.argv[2]
        os.environ['AWS_SECRET_ACCESS_KEY'] = sys.argv[3]
        print("Id:  %s" % (idOpal,))
        grabOpal(idOpal)
    else:
        print("Parametri insufficienti")
    return


if __name__ == "__main__":
    main()
Launching virtualenv
Thanks to MZMcBride, who suggested these lines of code. I have no idea what they do, but after running them pip mysteriously works, and internetarchive (and PIL too!) have been happily installed on itsource login-eqiad and dev-eqiad:
virtualenv --no-site-packages env
echo "source $HOME/env/bin/activate" >> .bashrc
source $HOME/env/bin/activate
pip install internetarchive
As I "become itsource" after any login I run simply:
source $HOME/env/bin/activate
and things run. :-) --Alex brollo (talk) 22:36, 8 March 2014 (UTC)