Code Snippets - A local file cache for Amazon S3

A local file cache for Amazon S3 using Python and boto

Amazon web services provides S3, a way to store and retrieve objects from the cloud.

This snippet illustrates how to create a Python-based wrapper around S3 storage, focused on transparently persisting files. The Python module, s3cache, in turn uses boto, a nifty open source package providing a Python API to Amazon Web Services.

To use the cache, simply import module s3cache, and use s3cache.open in place of open.

You'll need to define environment variables with your AWS keys:

export AWS_ACCESS_KEY_ID=....
export AWS_SECRET_ACCESS_KEY=.....
		

The following code uses s3cache to open a file multiple times, first to truncate and write the file, then to append to it, and finally to read from it. Each time the file is closed, any changes are backed up to S3. After opening the file, it can be treated just like any other file like object.

persist.py
# Demo: exercise the s3cache wrapper end-to-end.
from s3cache import s3cache

# Log cache activity and bypass the local cache so every open hits S3.
s3cache.setVerbosity(True)
s3cache.setCaching(False)

# Create (truncate) the file and write some content; close() pushes it to S3.
f = s3cache.open("/abc/world.txt","w")
f.write("Hello")
f.close()

# Re-open in append mode; the existing content is downloaded and extended.
f = s3cache.open("/abc/world.txt","a")
f.write(" World")
f.close()

# Read it back. Parenthesized print works on both Python 2 and Python 3
# (the original `print f2.readline()` statement is Python-2-only syntax).
f2 = s3cache.open("/abc/world.txt","r")
print(f2.readline())
f2.close()

# Clean up: remove both the local cache copy and the S3 object.
s3cache.remove("/abc/world.txt")

The next code snippet defines the s3cache class, which is based on the singleton pattern. The cache manages the exchange of data between S3 and the local filesystem. Note that at the end of the file s3cache.py, you'll need to customise the code to set your bucket name and the path to a local filesystem directory in which to keep local copies of the cached files.

s3cache.py
import boto
import os
from s3file import s3file

class s3cache(object):
    """Singleton cache manager that mirrors files between S3 and a
    local filesystem directory.

    Construct it once with a scratch directory and a bucket name; the
    constructor registers the instance on the class so the static
    convenience methods (open/remove/setVerbosity/setCaching) can
    reach it without the caller keeping a reference.
    """

    # singleton instance (set by __init__)
    instance = None

    def __init__(self, tmpdir, bucket_name):
        """Initialize the cache.

        tmpdir      -- local directory for cached copies of files
        bucket_name -- S3 bucket used as the backing store
        """
        self.bucket_name = bucket_name
        self.conn = None        # boto S3 connection, created lazily by connect()
        self.bucket = None      # boto bucket handle, created lazily by connect()
        self.tmpdir = tmpdir
        self.verbosity = False  # logging off by default
        self.caching = True     # local caching on by default
        s3cache.instance = self

    def connect(self):
        """Ensure that a connection to S3 exists (lazy and idempotent).

        Raises IOError if the connection or bucket creation fails.
        """
        if self.conn is None:
            try:
                # credentials come from AWS_ACCESS_KEY_ID /
                # AWS_SECRET_ACCESS_KEY environment variables
                self.conn = boto.connect_s3()
                # create_bucket returns the bucket, creating it if absent
                self.bucket = self.conn.create_bucket(self.bucket_name)
            except Exception:
                # raise a real exception type: string exceptions
                # (`raise "..."`) are invalid in modern Python
                raise IOError("Error - cannot connect to S3")

    def log(self, msg):
        """Write a message to the log (only if verbosity is on)."""
        if self.verbosity:
            print(msg)

    def removePath(self, path):
        """Remove a file from both the local cache and S3."""
        self.connect()
        s3f = s3file(s3cache.instance, path)
        s3f.remove()

    @staticmethod
    def setVerbosity(verbosity):
        """Set verbosity on/off (default=off)."""
        s3cache.instance.verbosity = verbosity

    @staticmethod
    def setCaching(caching):
        """Set local file caching on/off (default=on)."""
        s3cache.instance.caching = caching

    @staticmethod
    def open(path, mode):
        """Open a file in the cache and return a file-like object."""
        s3cache.instance.connect()
        s3f = s3file(s3cache.instance, path)
        s3f.open(mode)
        return s3f

    @staticmethod
    def remove(path):
        """Remove path from the cache and S3 via the singleton."""
        return s3cache.instance.removePath(path)
        
# Configuration
#
# Create the module-level singleton instance. The constructor registers
# itself on s3cache.instance, so the returned object need not be kept.

# local directory holding cached copies of S3 files -- customise this
local_cache_directory = "/tmp"
# name of the S3 bucket used as the backing store -- customise this
s3_bucket_name = "mccarroll.net.test"

s3cache(local_cache_directory,s3_bucket_name)

The final snippet defines the s3file object - a file-like object which delegates most of its calls directly to a real file object to operate on a locally cached copy of the file. This is done by defining __getattr__ to direct method calls to a delegator object which invokes them on the file.

s3file.py
import boto
from boto.s3.key import Key
import os

#
# wrap a local file with code to copy the contents into
# and out of S3
#
class s3file(object):
    """File-like object backed by S3.

    Reads and writes operate on a locally cached copy of the file;
    close() pushes any changes back to S3 when the file was opened
    for writing or appending. Attributes and methods not defined here
    are delegated to the underlying local file object.
    """

    def __init__(self, mgr, path):
        """Create (but do not open) a cached file.

        mgr  -- the owning s3cache instance
        path -- the S3 key / logical file path
        """
        self.mgr = mgr
        self.path = path
        self.mode = None
        # local cache location: the path is mangled into a flat filename
        self.tmppath = os.path.join(mgr.tmpdir, self.mangle(path))

    def removeCache(self):
        """Remove the local cache copy, if present (best effort)."""
        if os.path.exists(self.tmppath):
            try:
                os.remove(self.tmppath)
                self.log("removed local cache file(" + self.tmppath + ")")
            except OSError:
                # best effort: log and carry on
                self.log("problem removing local cache file(" + self.tmppath + ")")

    def remove(self):
        """Remove the file from both the local cache and S3."""
        self.log("removing file")
        self.removeCache()
        self.log("removing file from S3")
        k = Key(self.mgr.bucket)
        k.key = self.path
        try:
            k.delete()
        except Exception:
            # best effort: the key may not exist or S3 may be unreachable
            self.log("problem removing file")

    def open(self, mode):
        """Open the file with the given mode ('r', 'w', 'a', ...).

        For read/append modes the S3 copy is downloaded into the local
        cache first, unless caching is enabled and a cached copy
        already exists.
        """
        self.mode = mode
        if 'r' in self.mode or 'a' in self.mode:
            # opening an existing file: prefer the local cache copy
            self.log("trying to open existing file")
            use_local_copy = self.mgr.caching
            if use_local_copy and not os.path.exists(self.tmppath):
                self.log("not found in local cache, attempting to load from S3")
                use_local_copy = False
            if not use_local_copy:
                try:
                    k = Key(self.mgr.bucket)
                    k.key = self.path
                    k.get_contents_to_filename(self.tmppath)
                    self.log("file located in S3, downloaded from S3 to cache")
                except Exception:
                    # not fatal: fall through and open a fresh local file
                    self.log("file not found in S3, opening new empty file in local cache")
            else:
                self.log("file found in local cache")
        else:
            self.log("opening new file in local cache for writing")
        # open the local cache file; delegated calls operate on it
        self.log("opening local cache file(" + self.tmppath + ")")
        self.file = open(self.tmppath, self.mode)

    def mangle(self, path):
        """Flatten a file path into a cache filename.

        '/' becomes '_' and existing underscores are doubled so the
        mapping back is unambiguous.
        """
        pieces = []
        for c in path:
            if c == '/':
                pieces.append('_')
            elif c == '_':
                pieces.append('__')
            else:
                pieces.append(c)
        return ''.join(pieces)

    def __getattr__(self, name):
        # Only called for attributes NOT found on s3file itself;
        # delegate them to the underlying local file object.
        if name == "file":
            # self.file is unset before open(); raising here avoids
            # infinite __getattr__ recursion on self.file below
            raise AttributeError(name)
        return s3file.delegator(self.file, name)

    # utility class to delegate a call on this class to the local file
    class delegator(object):

        def __init__(self, target, name):
            self.target = target
            self.name = name

        def __call__(self, *args, **kwargs):
            # getattr handles inherited methods too; the original
            # target.__class__.__dict__[name] lookup raised KeyError for
            # methods defined on base classes of the file object
            return getattr(self.target, self.name)(*args, **kwargs)

    def close(self):
        """Close the local file; if it was opened for writing or
        appending, copy the updated contents back to S3."""
        self.log("closing local cache file(" + self.tmppath + ")")
        self.file.close()
        if 'w' in self.mode or 'a' in self.mode:
            self.log("writing updated cache file contents to S3")
            try:
                k = Key(self.mgr.bucket)
                k.key = self.path
                k.set_contents_from_filename(self.tmppath)
                self.log("write complete")
            except Exception:
                # best effort: leave the local copy intact, just log
                self.log("ERROR - write to S3 failed")

    def log(self, msg):
        """Prefix msg with this file's path and forward to the manager's log."""
        self.mgr.log("s3file(" + self.path + "): " + msg)

Note that this simple example is not robust in the face of multiple threads accessing the cache, and opening multiple instances of the same file at the same time may cause problems (just as it would with local filesystem files).


 

Leave a comment

Anti-Spam Check
Comment