Saturday, June 13, 2020

Loading that pickle, what's going on?

tqdm is a handy little utility for showing how well some process is progressing (it works within Python code, but you can also use it on any piped shell process).

Pickle is Python's built-in serialization, which can take an awfully long time on really large objects. Unfortunately, pickle.load only takes an open file, so there is no built-in way to see how far along it is.

So here's a little useful snippet for estimating how long it's going to take to deserialize that big Python pickle file.

class TQDMBytesReader(object):
    """Show progress while reading from a file.

    Wraps an open file object and advances a tqdm progress bar by the
    length of whatever each ``read``/``readline`` call returns.

    Args:
        fd: Open file object to read from.
        **kwargs: Passed through unchanged to ``tqdm(...)``.
    """

    def __init__(self, fd, **kwargs):
        self.fd = fd
        # Imported here so tqdm is only required once a reader is created.
        from tqdm import tqdm
        self.tqdm = tqdm(**kwargs)

    def read(self, size=-1):
        """Read from the wrapped file and advance the bar.

        ``size`` is forwarded to the wrapped file's ``read``. Returns the
        data that was read.
        """
        data = self.fd.read(size)
        self.tqdm.update(len(data))
        return data

    def readline(self, size=-1):
        """Read one line from the wrapped file and advance the bar.

        With ``size`` left at its default (or ``None``/negative) the wrapped
        file's ``readline`` is called without arguments, exactly as before;
        otherwise ``size`` is forwarded to it. Returns the data that was read.
        """
        if size is None or size < 0:
            data = self.fd.readline()
        else:
            data = self.fd.readline(size)
        self.tqdm.update(len(data))
        return data

    def __enter__(self):
        """Enter the progress bar's context and return this reader."""
        self.tqdm.__enter__()
        return self

    def __exit__(self, *args, **kwargs):
        """Delegate to the progress bar's ``__exit__`` and return its result."""
        return self.tqdm.__exit__(*args, **kwargs)
# Usage example: assumes `os` and `pickle` are imported and that `filename`
# holds the path of the pickle file to load.
with open(filename, "rb") as fd, \
        TQDMBytesReader(fd, desc="Loading pickle", unit="b",
                        total=os.path.getsize(filename)) as reader:
    # pickle pulls its bytes through reader.read()/readline(), so the bar
    # advances toward the file size given as `total`.
    obj = pickle.load(reader)

Here's the complement, although it's not terribly helpful since pickle serializes everything to memory and only then writes to disk:

class TQDMBytesWriter(object):
    """File-like wrapper that advances a tqdm bar as data is written.

    ``fd`` is the open file object to write to; every other keyword
    argument is forwarded unchanged to ``tqdm(...)``.
    """

    def __init__(self, fd, **kwargs):
        self.fd = fd
        # Imported here so tqdm is only required once a writer is created.
        from tqdm import tqdm
        progress = tqdm(**kwargs)
        self.tqdm = progress

    def write(self, b):
        """Write ``b`` to the wrapped file and return that file's result."""
        count = self.fd.write(b)
        # A falsy result (such as None) advances the bar by zero.
        advance = count if count else 0
        self.tqdm.update(advance)
        return count

    def __enter__(self):
        """Enter the progress bar's context and return this writer."""
        self.tqdm.__enter__()
        return self

    def __exit__(self, *args, **kwargs):
        """Delegate to the progress bar's ``__exit__`` and return its result."""
        bar = self.tqdm
        return bar.__exit__(*args, **kwargs)

No comments: