User:Alex brollo/OpalToIA

This is my draft (but happily running) OpalLib.py script under the itsource project. It grabs PDF files from Opal Libri antichi (double-page scans), converts them to TIFF images, crops and splits each image into single pages, packs the pages into a zip file named [item id]_images.zip, and uploads it to Internet Archive together with a simple set of metadata coming from Opal.

Presently it is run from the itsource shell with the command:

python OpalLib.py [opal_id] [IA S3 access key] [IA S3 secret key]

or by a jsub command:

jsub -sync y python OpalLib.py [opal_id] [IA S3 access key] [IA S3 secret key]

or by

./test.sh [opal_id] [IA S3 access key] [IA S3 secret key]

or simply, presently,

./opalnew.sh [IA S3 access key] [IA S3 secret key]

where opalnew.sh is merely a list of individual "jsub ... python ..." commands.

The internetarchive module is installed locally and runs inside a virtualenv (itsource has recently been migrated to eqiad, and I got the valuable suggestion to use virtualenv so that the needed modules can be installed locally with pip).

#!/usr/bin/python
# -*- coding: utf-8 -*-

import os, Image, ImageChops, shutil, math, pickle,sys,anydbm
from urllib import FancyURLopener
import internetarchive

class MyOpener(FancyURLopener):
    """URL opener that identifies this script to the remote server.

    ``version`` is the value of the User-Agent header sent with each request,
    so it must hold only the agent string, not the header name.
    """
    # The header name "User-Agent: " used to be embedded in this value (and
    # the assignment target was duplicated); the opener adds the header name
    # itself, which produced "User-Agent: User-Agent: Alex ...".
    version = 'Alex (+http://it.wikisource.org/wiki/Utente:Alex_brollo)'

# Shared opener instance used by grab().
opener=MyOpener()


def iaMetadata(id):
    """Build the Internet Archive metadata dictionary for one Opal book.

    The record is read from the dbm file ``public_html/opal.db`` under the
    key ``id + ".pdf"``.  Its value is decoded as UTF-8 and split on tabs;
    the fields used here are 0 (author), 1 (title), 2 (only quoted in the
    description), 5 (city), 6 (printer) and 7 (year).

    @param id Record key without the ".pdf" suffix (the caller in this file
        passes "collection:name").  The name shadows the builtin ``id`` but
        is kept for compatibility with existing callers.
    @return A dict of metadata values ready to be passed to the upload call.
    @raise KeyError if the database has no record for the key.
    @raise IndexError if the record has fewer than eight fields.
    """
    mtd=anydbm.open("public_html/opal.db")
    try:
        meta=unicode(mtd[id+".pdf"],"utf-8").split("\t")
    finally:
        # Release the database even when the key is missing or undecodable.
        mtd.close()

    autore=nomeCognome(meta[0])
    # Bibliographic citation followed by the fixed scanning credit.
    citazione=" ".join([meta[0]+".",meta[1]+".",meta[2]+".",meta[5]+":",meta[6]+",",meta[7]])
    credito='<br /><div>Scanned by Claudio Ruggeri, <a href="http://www.opal.unito.it" rel="nofollow">Opal Libri antichi</a>, University of Turin</div>'

    myMetadata={}
    myMetadata['mediatype']='texts'
    myMetadata['language']='ita'
    myMetadata['licenseurl']='http://creativecommons.org/publicdomain/zero/1.0/'
    myMetadata['collection']='opensource_media'
    myMetadata['subject']='Italian theater;16th century'
    myMetadata['creator']=autore
    myMetadata['title']=meta[1]+" ("+autore+")"
    myMetadata['year']=meta[7]
    myMetadata['printer']=meta[6]
    myMetadata['city']=meta[5]
    myMetadata['description']=citazione+credito
    return myMetadata

def nomeCognome(autore):
    """Turn "Surname, Name" into "Name Surname".

    Only the first two comma-separated parts are used, each stripped of
    surrounding whitespace.  A string without a comma is returned unchanged.
    """
    if "," not in autore:
        return autore
    parti=autore.split(",")
    cognome=parti[0].strip()
    nome=parti[1].strip()
    return nome+" "+cognome
    

def grabOpal(id): # id is collection/pdf name without extension (e.g. teatro/image10)
    """Download one Opal PDF and run the whole conversion/upload pipeline.

    @param id "collection/name", where collection is one of "teatro",
        "narrativa" or "miscellanea" and name is the PDF file name without
        its extension.  Only the first two "/"-separated parts are used.
    @return None.  A malformed id (no "/", empty name, unknown collection)
        only prints "Errore parametri"; it used to raise IndexError when the
        "/" was missing.
    """
    basi={
        "teatro": "http://www.opal.unito.it/psixsite/Teatro%20italiano%20del%20XVI%20e%20XVII%20secolo/Elenco%20opere/",
        "narrativa": "http://www.opal.unito.it/psixsite/Narrativa%20italiana%20del%20Seicento%20(e%20dintorni)/Elenco%20opere/",
        "miscellanea": "http://www.opal.unito.it/psixsite/Miscellanea%20di%20testi%20di%20genere%20diverso/Elenco%20opere/",
    }
    parti=id.split("/")
    # Validate before indexing so that a bad argument yields the error
    # message instead of a traceback.
    if len(parti)<2 or not parti[1] or parti[0] not in basi:
        print("Errore parametri")
        return
    raccolta=parti[0]
    idOpal=parti[1]

    url=basi[raccolta]+idOpal+".pdf"
    # Internet Archive identifier: name + capitalised collection + "Opal".
    idIA=idOpal+raccolta[0:1].upper()+raccolta[1:]+"Opal"
    print("Grabbing...")
    grab(url,idIA+".pdf")   # grabbing pdf file from Opal and saving it as [idIA].pdf
    print("Launching opal....")
    opal(idIA,idOpal,raccolta,tipo="tiff")  # launching the main routine to extract, split, zip and upload
    return

def opal(id, idOpal, raccolta,tipo="tiff", taglio=True ):
    """Extract the pages of a downloaded PDF, split them, zip them and upload.

    @param id Internet Archive identifier; the PDF is expected at id + ".pdf".
    @param idOpal Opal file name, forwarded to iaUpload.
    @param raccolta Opal collection name, forwarded to iaUpload.
    @param tipo "tiff" selects the "tif" extension, anything else "jpg".
        NOTE(review): separa() always writes ".tif" files, so a value other
        than "tiff" presumably finds no pages to split - confirm before use.
    @param taglio When true each scan is split into a left and a right page
        (written to public_html/out2); when false the unsplit images in
        public_html/out are zipped as they are.
    @return None.
    """
    separa(id+".pdf",tipo)

    print("Inizio splitting")
    tipoFile="tif" if tipo=="tiff" else "jpg"
    if taglio:
        tT(tipoFile) # splitting routine
        # deleting first empty page since frontespice must be the first image.
        # Only done after a split: without it out2 is empty and the removal
        # used to fail with OSError.
        os.remove("public_html/out2/pag-0000."+tipoFile)
    zippa(id,taglio)                             # zipping only; the upload follows

    # The message says "Opal" but the upload below goes through iaUpload.
    print("Inizio caricamento su Opal")
    iaUpload(id,idOpal,raccolta)
    print("Fatto")
    return
    
def grab(url,output=None):
    """Download a URL with the module-level opener and save it to a file.

    "?action=render" is appended to the URL before the request is made.

    @param url Address to fetch.
    @param output Destination file name; when None the part of url after
        the last "/" is used.
    @return The string "Fatto".
    """
    page=opener.open(url+"?action=render")
    try:
        content=page.read()
    finally:
        # Do not leave the connection open if reading fails.
        page.close()
    if output is None:
        output=url[url.rfind("/")+1:]
    with open(output,"wb") as f:
        f.write(content)
    print(output)
    return "Fatto"

def carica_pcl(nome_file, folder=""):
    """Load and return the object pickled in folder + nome_file + ".pcl".

    The file is opened in binary mode, as the pickle module expects, and is
    closed even when unpickling fails.

    @param nome_file File name without the ".pcl" extension.
    @param folder Prefix prepended verbatim to the name (include the
        trailing separator).
    @return The unpickled object.
    """
    nome_file=folder+nome_file+".pcl"
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # script wrote itself.
    with open(nome_file,"rb") as f:
        return pickle.load(f)

def salva_pcl(variabile,nome_file="dato",folder=""):
    """Pickle variabile into folder + nome_file + ".pcl" and report it.

    The file is opened in binary mode, as the pickle module expects, and is
    closed even when pickling fails.

    @param variabile Object to save.
    @param nome_file File name without the ".pcl" extension.
    @param folder Prefix prepended verbatim to the name (include the
        trailing separator).
    @return None.
    """
    nome_file=folder+nome_file+".pcl"
    with open(nome_file,"wb") as f:
        pickle.dump(variabile, f)
    print("Variabile salvata nel file "+nome_file)
    return


##
# Trim the uniform border from an image.
#
# The image is compared, in RGB, with a solid canvas of the same size
# filled with bgcolor; the bounding box of the difference is what is kept.
#
# @param im Source image (converted to RGB first when in another mode).
# @param bgcolor Background color, using either a color tuple or
#     a color name (1.1.4 only).
# @return The cropped RGB image, or None if the difference has no
#     bounding box, i.e. the image has no actual content.

def autocrop(im, bgcolor):
    rgb = im if im.mode == "RGB" else im.convert("RGB")
    sfondo = Image.new("RGB", rgb.size, bgcolor)
    riquadro = ImageChops.difference(rgb, sfondo).getbbox()
    if not riquadro:
        return None # no contents
    return rgb.crop(riquadro)


def separa(filepdf, tipo):
    """Empty the working folders and render every PDF page to a TIFF.

    All entries of public_html/out and public_html/out2 are deleted, then
    Ghostscript is run to write public_html/out/pagNNNN.tif files.

    @param filepdf Path of the PDF, appended verbatim to the command line.
    @param tipo Accepted but not used by this function.
    @return None.
    """
    for cartella in ("public_html/out", "public_html/out2"):
        for nome in os.listdir(cartella):
            os.remove(cartella+"/"+nome)

    comando='gs -sDEVICE=tiff24nc -r300x300 -sCompression=lzw -dNOPAUSE -dBATCH -sOutputFile="public_html/out/pag%04d.tif" '+filepdf
    # Echo the command before running it.
    print(comando)
    os.system(comando)
    return

def zippa(IaId,taglio=True):
    """Zip the page images into public_html/<IaId>_images.zip.

    @param IaId Archive identifier; any ".pdf" in it is removed first.
    @param taglio True zips the split pages (public_html/out2), False the
        unsplit images (public_html/out).
    @return None.
    """
    nome=IaId.replace(".pdf","")
    cartella="public_html/out2" if taglio else "public_html/out"
    os.system("zip -r public_html/"+nome+"_images.zip "+cartella)
    return
    
def calcola(x1,y1,m0,m1,m2,m3):
    """Print four reduced values as percentages of x1 or y1.

    Each of m0..m3 is first reduced by delta = (x1 - m0 - m2) * 0.1; m0 and
    m2 are then printed as a percentage of x1, m1 and m3 as a percentage of
    y1, in the order m0, m1, m2, m3.
    (Presumably x1/y1 are a page width/height and m0..m3 margins - the code
    itself does not say.)

    @return None; the results are only printed.
    """
    delta=(x1-m0-m2)*0.1
    for valore,riferimento in ((m0,x1),(m1,y1),(m2,x1),(m3,y1)):
        print((valore-delta)/riferimento*100)
    return
   
def tT(tipoFile="tif"): #  splits images of /out into /out2
    """Crop every double-page scan and split it into two single pages.

    Scans are read from public_html/out/pagNNNN.<tipoFile>, starting at 0001
    and stopping at the first missing number.  Each one is cropped against a
    white background and cut at half its width; scan number i (counting from
    0) is saved as public_html/out2/pag-<2i> (left half) and pag-<2i+1>
    (right half).

    A scan with no content, for which autocrop returns None, is split
    uncropped so that page numbering stays in step; it used to raise
    AttributeError.

    @param tipoFile File extension of the images, without the dot.
    @return None.
    """
    lista=[]
    n=1
    while os.path.isfile("public_html/out/pag"+str(n).zfill(4)+"."+tipoFile):
        lista.append("public_html/out/pag"+str(n).zfill(4)+"."+tipoFile)
        n+=1

    for i,percorso in enumerate(lista):
        scansione=Image.open(percorso)
        ritaglio=autocrop(scansione,(255,255,255))
        if ritaglio is None:
            # Blank scan: nothing to crop, keep the full image.
            ritaglio=scansione
        larghezza,altezza=ritaglio.size
        meta=int(larghezza*0.5)

        # creating the left page
        sinistra=ritaglio.crop((0,0,meta,altezza))
        sinistra.save("public_html/out2/pag-"+f0(i*2)+"."+tipoFile)

        # creating the right page
        destra=ritaglio.crop((meta,0,larghezza,altezza))
        destra.save("public_html/out2/pag-"+f0(i*2+1)+"."+tipoFile)
    return

def f0(n,w=4):
    """Return n as a string left-padded with zeros to at least w characters.

    Unlike the previous slicing implementation, numbers with more than w
    digits are no longer truncated (f0(12345) was "2345"), and widths above
    four are honoured.

    @param n Number (or anything str() accepts) to format.
    @param w Minimum width of the result.
    @return The zero-padded string.
    """
    return str(n).zfill(w)


def iaUpload(iaId,idOpal,raccolta,test=False):
    """Upload the zipped images and the source PDF to an Internet Archive item.

    Documentation: https://pypi.python.org/pypi/internetarchive
    S3 keys: https://archive.org/account/s3.php

    The metadata come from iaMetadata(raccolta + ":" + idOpal).  When the
    item does not exist yet, public_html/<iaId>_images.zip is uploaded with
    those metadata; then <iaId>.pdf is moved into sourceImages.zip and that
    archive is uploaded to the same item.

    @param iaId Internet Archive item identifier.
    @param idOpal Opal file name used in the metadata key.
    @param raccolta Opal collection name used in the metadata key.
    @param test When true nothing is uploaded; a notice is printed instead.
    @return None.
    """
    metadati=iaMetadata(raccolta+":"+idOpal)

    item=internetarchive.Item(iaId)
    percorso_zip="public_html/"+iaId+"_images.zip"
    # " ".join reproduces the separating space of the two-argument print
    # statements this replaces.
    print(" ".join(["idOpal: ",idOpal]))
    print(" ".join(["File zip: ",percorso_zip]))
    print(iaId)

    if test:
        print("Item "+iaId+" not uploaded (run test)")
        return
    if item.exists:
        print("Item "+iaId+" already exists")
        return

    item.upload(percorso_zip, metadata=metadati)
    # Empty the archive left over from the previous book.  The pattern is
    # quoted so that zip matches it against the archive entries; unquoted,
    # the shell expanded it to the names in the working directory, which no
    # longer include the previously moved PDF, so old PDFs piled up.
    os.system('zip -d sourceImages "*"')
    os.system("zip -m sourceImages "+iaId+".pdf")
    item.upload("sourceImages.zip")
    return

def main():
    """Command-line entry point: OpalLib.py opal_id access_key secret_key.

    The two keys are exported as AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
    in the process environment, then grabOpal is run on the id.  With fewer
    than three arguments only "Parametri insufficienti" is printed.
    """
    if len(sys.argv)<4:
        print("Parametri insufficienti")
        return

    idOpal,chiave,segreto=sys.argv[1:4]
    os.environ['AWS_ACCESS_KEY_ID']=chiave
    os.environ['AWS_SECRET_ACCESS_KEY']=segreto
    # " ".join reproduces the separating space of the two-argument print.
    print(" ".join(["Id: ",idOpal]))
    grabOpal(idOpal)
    return

if __name__ == "__main__":
    main()

Launching virtualenv

Thanks to MZMcBride, who suggested these lines of code. I have no idea what they do, but after running them pip mysteriously works, and internetarchive (and PIL too!) have been happily installed on itsource login-eqiad and dev-eqiad:

virtualenv --no-site-packages env
echo "source $HOME/env/bin/activate" >> .bashrc
source $HOME/env/bin/activate
pip install internetarchive

As I "become itsource" after any login I run simply:

source $HOME/env/bin/activate

and things run. :-) --Alex brollo (talk) 22:36, 8 March 2014 (UTC)