User:Alex brollo/OpalToIA
This is my draft (but happily running) OpalLib.py script under the itsource project. It grabs PDF files from Opal Libri antichi (double-page scans), crops and splits the derived TIFFs, packs them into a zip archive named [IA id]_images.zip, and uploads that archive to Internet Archive together with a simple set of metadata coming from Opal.
Presently it is run from the itsource shell by a command:
python OpalLib.py [opal_id] [IA S3 access key] [IA S3 secret key]
or by a jsub command:
jsub -sync y python OpalLib.py [opal_id] [IA S3 access key] [IA S3 secret key]
or by
./test.sh [opal_id] [IA S3 access key] [IA S3 secret key]
or simply, presently,
./opalnew.sh [IA S3 access key] [IA S3 secret key]
opalnew.sh being merely a list of individual jsub .... python....
internetarchive module is locally installed, and it runs into a virtualenv (itsource has been recently migrated into eqiad and I got the valuable suggestion about virtualenv to install locally by pip needed modules).
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os, Image, ImageChops, shutil, math, pickle,sys,anydbm
from urllib import FancyURLopener
import internetarchive
class MyOpener(FancyURLopener):
    """URL opener that identifies this script to the remote server.

    ``FancyURLopener`` sends the class attribute ``version`` as the value of
    the ``User-Agent`` request header.
    """

    # Only the header *value* belongs here: urllib supplies the "User-Agent"
    # header name itself, so repeating it inside the string would send
    # "User-Agent: User-Agent: Alex ...".
    version = 'Alex (+http://it.wikisource.org/wiki/Utente:Alex_brollo)'


# Shared opener instance used by grab() for every download.
opener = MyOpener()
def iaMetadata(id):
    """Build the Internet Archive metadata dictionary for one Opal book.

    The record is read from the dbm file ``public_html/opal.db`` under the
    key ``id + ".pdf"``.  Its value is decoded as UTF-8 and split on tabs;
    fields 0 (author), 1 (title), 2, 5 (city), 6 (printer) and 7 (year) are
    used.

    :param id: record key without the ``.pdf`` suffix.
    :return: dict with the metadata fields to attach to the upload.
    :raises KeyError: if the key is missing from the database.
    :raises IndexError: if the record has fewer than eight fields.
    """
    mtd = anydbm.open("public_html/opal.db")
    try:
        meta = unicode(mtd[id + ".pdf"], "utf-8").split("\t")
    finally:
        # anydbm handles are not context managers in Python 2, so close
        # explicitly even when the lookup or the decoding fails.
        mtd.close()

    autore = nomeCognome(meta[0])
    crediti = '<br /><div>Scanned by Claudio Ruggeri, <a href="http://www.opal.unito.it" rel="nofollow">Opal Libri antichi</a>, University of Turin</div>'
    # Bibliographic citation line placed in front of the scanning credits.
    citazione = " ".join([meta[0] + ".", meta[1] + ".", meta[2] + ".",
                          meta[5] + ":", meta[6] + ",", meta[7]])

    myMetadata = {}
    myMetadata['mediatype'] = 'texts'
    myMetadata['language'] = 'ita'
    myMetadata['licenseurl'] = 'http://creativecommons.org/publicdomain/zero/1.0/'
    myMetadata['collection'] = 'opensource_media'
    myMetadata['subject'] = 'Italian theater;16th century'
    myMetadata['creator'] = autore
    myMetadata['title'] = meta[1] + " (" + autore + ")"
    myMetadata['year'] = meta[7]
    myMetadata['printer'] = meta[6]
    myMetadata['city'] = meta[5]
    myMetadata['description'] = citazione + crediti
    return myMetadata
def nomeCognome(autore):
    """Turn "Surname, Name" into "Name Surname".

    A string without a comma is returned unchanged.  When there are several
    commas only the first two pieces are used.
    """
    if "," not in autore:
        return autore
    pezzi = autore.split(",")
    cognome = pezzi[0].strip()
    nome = pezzi[1].strip()
    return nome + " " + cognome
def grabOpal(id):
    """Download one Opal PDF and run the whole processing pipeline on it.

    :param id: ``collection/name`` of the PDF without extension, e.g.
        ``teatro/image10``.  The collection is one of ``teatro``,
        ``narrativa`` or ``miscellanea``; the one-letter shortcuts ``t``,
        ``n`` and ``m`` are accepted and expanded to the full name.
    :return: None.  Prints "Errore parametri" and returns early when the
        id is malformed or the collection is unknown.
    """
    basi = {
        "teatro": "http://www.opal.unito.it/psixsite/Teatro%20italiano%20del%20XVI%20e%20XVII%20secolo/Elenco%20opere/",
        "narrativa": "http://www.opal.unito.it/psixsite/Narrativa%20italiana%20del%20Seicento%20(e%20dintorni)/Elenco%20opere/",
        "miscellanea": "http://www.opal.unito.it/psixsite/Miscellanea%20di%20testi%20di%20genere%20diverso/Elenco%20opere/",
    }
    # The original code tested for these shortcuts only after rejecting
    # everything but the full names, so they could never match.  Expanding
    # them first keeps the metadata key and the IA identifier built from
    # the full collection name.
    scorciatoie = {"t": "teatro", "n": "narrativa", "m": "miscellanea"}

    parti = id.split("/")
    if len(parti) < 2 or not parti[1]:
        # No "collection/name" structure: report instead of IndexError.
        print("Errore parametri")
        return
    raccolta = scorciatoie.get(parti[0], parti[0])
    idOpal = parti[1]
    if raccolta not in basi:
        print("Errore parametri")
        return

    url = basi[raccolta] + idOpal + ".pdf"
    # IA identifier: name + capitalised collection + "Opal".
    idIA = idOpal + raccolta[0:1].upper() + raccolta[1:] + "Opal"
    print("Grabbing...")
    grab(url, idIA + ".pdf")  # save the Opal PDF locally as [idIA].pdf
    print("Launching opal....")
    opal(idIA, idOpal, raccolta, tipo="tiff")  # extract, split, zip, upload
    return
def opal(id, idOpal, raccolta, tipo="tiff", taglio=True):
    """Run the pipeline on an already downloaded PDF.

    Steps: rasterise ``id + ".pdf"`` with separa(), optionally split every
    scan into two pages with tT(), zip the images with zippa() and upload
    them with iaUpload().

    :param id: Internet Archive identifier (also the local PDF base name).
    :param idOpal: Opal name of the PDF, forwarded to iaUpload().
    :param raccolta: Opal collection name, forwarded to iaUpload().
    :param tipo: "tiff" selects the "tif" extension, anything else "jpg".
    :param taglio: when true, split the double-page scans.
    """
    separa(id + ".pdf", tipo)
    print("Inizio splitting")
    tipoFile = "tif" if tipo == "tiff" else "jpg"
    if taglio:
        tT(tipoFile)  # splitting routine
        # The first half-page is dropped so that the title page becomes the
        # first image.
        # NOTE(review): the listing lost its indentation; this removal is
        # assumed to belong to the `taglio` branch because out2 is only
        # filled by tT() - confirm against the running script.
        os.remove("public_html/out2/pag-0000." + tipoFile)
    zippa(id, taglio)  # zipping
    print("Inizio caricamento su Opal")
    iaUpload(id, idOpal, raccolta)
    print("Fatto")
    return
def grab(url, output=None):
    """Download ``url + "?action=render"`` and save the body to a file.

    :param url: address to fetch through the module-level ``opener``.
    :param output: destination path; defaults to the part of ``url`` after
        its last "/".
    :return: the string "Fatto".
    """
    page = opener.open(url + "?action=render")
    try:
        content = page.read()
    finally:
        # Release the connection even if reading fails.
        page.close()
    if output is None:
        output = url[url.rfind("/") + 1:]
    # `with` guarantees the data is flushed and the handle closed before
    # the caller hands the file to ghostscript.
    with open(output, "wb") as destinazione:
        destinazione.write(content)
    print(output)
    return "Fatto"
def carica_pcl(nome_file, folder=""):
    """Load and return the object pickled in ``folder + nome_file + ".pcl"``.

    :param nome_file: file name without the ``.pcl`` extension.
    :param folder: prefix prepended verbatim (include the trailing
        separator).
    :return: the unpickled object.
    """
    nome_file = folder + nome_file + ".pcl"
    # Binary mode is what pickle expects; `with` closes the file even when
    # unpickling raises.
    # NOTE: pickle can execute arbitrary code - only load files written by
    # this script.
    with open(nome_file, "rb") as f:
        return pickle.load(f)
def salva_pcl(variabile, nome_file="dato", folder=""):
    """Pickle ``variabile`` into ``folder + nome_file + ".pcl"``.

    :param variabile: object to serialise.
    :param nome_file: file name without the ``.pcl`` extension.
    :param folder: prefix prepended verbatim (include the trailing
        separator).
    """
    nome_file = folder + nome_file + ".pcl"
    # Binary mode is what pickle expects; `with` closes (and flushes) the
    # file even when serialisation raises.
    with open(nome_file, "wb") as f:
        pickle.dump(variabile, f)
    print("Variabile salvata nel file " + nome_file)
    return
##
# Trim the uniform border around an image.
#
# The image is converted to RGB when needed and compared with a solid
# canvas of the background colour; the bounding box of the difference is
# the area that is kept.
#
# @param im Source image.
# @param bgcolor Background color, either a color tuple or a color name
#     (the original note says names need PIL 1.1.4).
# @return The cropped RGB image, or None when the image has no content
#     that differs from the background.
def autocrop(im, bgcolor):
    rgb = im if im.mode == "RGB" else im.convert("RGB")
    sfondo = Image.new("RGB", rgb.size, bgcolor)
    riquadro = ImageChops.difference(rgb, sfondo).getbbox()
    if not riquadro:
        return None  # no contents
    return rgb.crop(riquadro)
def separa(filepdf, tipo):
    """Rasterise a PDF into one 300 dpi TIFF per page with ghostscript.

    Every file in ``public_html/out`` and ``public_html/out2`` is deleted
    first; the pages are then written to ``public_html/out/pagNNNN.tif``.

    :param filepdf: path of the PDF to convert.
    :param tipo: accepted for interface compatibility; the output is always
        LZW-compressed 24-bit TIFF.
    """
    # Function-scope import so the block does not depend on the file header.
    import subprocess

    for cartella in ("public_html/out", "public_html/out2"):
        for nome in os.listdir(cartella):
            os.remove(os.path.join(cartella, nome))

    # An argument list (no shell) keeps file names containing spaces or
    # shell metacharacters intact.
    comando = [
        "gs",
        "-sDEVICE=tiff24nc",
        "-r300x300",
        "-sCompression=lzw",
        "-dNOPAUSE",
        "-dBATCH",
        "-sOutputFile=public_html/out/pag%04d.tif",
        filepdf,
    ]
    print(" ".join(comando))
    esito = subprocess.call(comando)
    if esito != 0:
        # Keep going as before, but make the failure visible in the log.
        print("gs exited with status %d" % esito)
    return
def zippa(IaId, taglio=True):
    """Zip the page images into ``public_html/[IaId]_images.zip``.

    :param IaId: Internet Archive identifier; a ".pdf" inside it is removed.
    :param taglio: true to archive the split pages (``public_html/out2``),
        false to archive the unsplit scans (``public_html/out``).
    """
    nome = IaId.replace(".pdf", "")
    sorgente = "public_html/out2" if taglio else "public_html/out"
    os.system("zip -r public_html/" + nome + "_images.zip " + sorgente)
    return
def calcola(x1, y1, m0, m1, m2, m3):
    """Print four margins as percentages of the page size.

    Each margin is first reduced by ``delta``, one tenth of
    ``x1 - m0 - m2``.  ``m0`` and ``m2`` are then printed relative to
    ``x1``; ``m1`` and ``m3`` relative to ``y1``.
    """
    delta = (x1 - m0 - m2) * 0.1
    coppie = ((m0, x1), (m1, y1), (m2, x1), (m3, y1))
    for margine, lato in coppie:
        print((margine - delta) / lato * 100)
    return
def tT(tipoFile="tif"):
    """Split every double-page scan of ``public_html/out`` into two pages.

    Scans are read as ``public_html/out/pagNNNN.<tipoFile>`` starting at
    0001 and stopping at the first missing number.  Each one is
    auto-cropped against a white background and cut in half vertically:
    scan number i (counting from 0) produces ``public_html/out2/pag-``
    files numbered 2*i (left half) and 2*i+1 (right half).

    :param tipoFile: file extension of the input and output images.
    """
    lista = []
    n = 1
    while True:
        percorso = "public_html/out/pag" + str(n).zfill(4) + "." + tipoFile
        if not os.path.isfile(percorso):
            break
        lista.append(percorso)
        n += 1

    for i, percorso in enumerate(lista):
        scansione = Image.open(percorso)
        ritaglio = autocrop(scansione, (255, 255, 255))
        if ritaglio is None:
            # autocrop() returns None for a completely white scan; split
            # the uncropped image so the output numbering stays aligned
            # instead of failing on `None.size`.
            ritaglio = scansione
        larghezza, altezza = ritaglio.size
        meta = int(larghezza * 0.5)
        # left page
        sinistra = ritaglio.crop((0, 0, meta, altezza))
        sinistra.save("public_html/out2/pag-" + f0(i * 2) + "." + tipoFile)
        # right page
        destra = ritaglio.crop((meta, 0, larghezza, altezza))
        destra.save("public_html/out2/pag-" + f0(i * 2 + 1) + "." + tipoFile)
    return
def f0(n, w=4):
    """Return ``n`` as a decimal string left-padded with zeros to width ``w``.

    Numbers with more than ``w`` digits are returned in full; the previous
    slicing kept only their last ``w`` digits, which made distinct page
    numbers collide, and it could not pad beyond five characters.

    :param n: number (or anything ``str()`` accepts) to format.
    :param w: minimum width of the result.
    :return: the zero-padded string.
    """
    return str(n).zfill(w)
def iaUpload(iaId, idOpal, raccolta, test=False):
    """Upload the image archive and the source PDF to Internet Archive.

    The metadata comes from ``iaMetadata(raccolta + ":" + idOpal)``.  When
    the item does not exist yet, ``public_html/[iaId]_images.zip`` is
    uploaded with that metadata, then ``[iaId].pdf`` is moved into
    ``sourceImages.zip`` and that archive is uploaded as well.  Nothing is
    uploaded when the item already exists or when ``test`` is true.

    :param iaId: Internet Archive item identifier.
    :param idOpal: Opal name of the PDF (second half of the metadata key).
    :param raccolta: Opal collection name (first half of the metadata key).
    :param test: dry run - look everything up but upload nothing.
    """
    # Documentation: https://pypi.python.org/pypi/internetarchive
    # S3 keys: https://archive.org/account/s3.php
    metadati = iaMetadata(raccolta + ":" + idOpal)
    item = internetarchive.Item(iaId)
    archivio = "public_html/" + iaId + "_images.zip"
    # The joins reproduce the spacing of the former two-argument prints.
    print(" ".join(("idOpal: ", idOpal)))
    print(" ".join(("File zip: ", archivio)))
    print(iaId)

    if test:
        print("Item " + iaId + " not uploaded (run test)")
        return
    if item.exists:
        print("Item " + iaId + " already exists")
        return

    item.upload(archivio, metadata=metadati)
    # Start from an empty sourceImages.zip.  The former
    # `zip -d sourceImages *` depended on the shell expanding `*` to the
    # names in the working directory, so entries left by earlier runs were
    # not reliably removed and could be uploaded again with this item.
    if os.path.exists("sourceImages.zip"):
        os.remove("sourceImages.zip")
    os.system("zip -m sourceImages " + iaId + ".pdf")
    item.upload("sourceImages.zip")
    return
def main():
    """Command-line entry point.

    Expects ``[opal_id] [IA S3 access key] [IA S3 secret key]`` in
    ``sys.argv``.  The two keys are exported as ``AWS_ACCESS_KEY_ID`` and
    ``AWS_SECRET_ACCESS_KEY`` before grabOpal() is started.  With fewer
    arguments only "Parametri insufficienti" is printed.
    """
    if len(sys.argv) < 4:
        print("Parametri insufficienti")
        return
    idOpal = sys.argv[1]
    os.environ['AWS_ACCESS_KEY_ID'] = sys.argv[2]
    os.environ['AWS_SECRET_ACCESS_KEY'] = sys.argv[3]
    # The join reproduces the spacing of the former two-argument print.
    print(" ".join(("Id: ", idOpal)))
    grabOpal(idOpal)
    return


if __name__ == "__main__":
    main()
Launching virtualenv
Thanks to MZMcBride, who suggested these lines of code. I have no idea what they do, but after running them pip mysteriously works, and internetarchive (and PIL too!) have been happily installed on itsource login-eqiad and dev-eqiad:
virtualenv --no-site-packages env
echo "source $HOME/env/bin/activate" >> .bashrc
source $HOME/env/bin/activate
pip install internetarchive
As I "become itsource" after any login I run simply:
source $HOME/env/bin/activate
and things run. :-) --Alex brollo (talk) 22:36, 8 March 2014 (UTC)