Source code for sisyphus.hash

import hashlib
from inspect import isclass, isfunction


def md5(obj):
    """
    :param obj:
    :rtype: str
    """
    return hashlib.md5(str(obj).encode()).hexdigest()


def int_hash(obj):
    """
    :param object obj:
    :rtype: int
    """
    h = hashlib.sha256(sis_hash_helper(obj)).digest()
    return int.from_bytes(h, byteorder='big', signed=False)


def short_hash(obj,
               length=12,
               chars='0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'):
    """
    :param object obj:
    :param int length:
    :param str|T chars:
    :rtype: str|T
    """
    h = hashlib.sha256(sis_hash_helper(obj)).digest()
    h = int.from_bytes(h, byteorder='big', signed=False)
    ls = []
    for i in range(length):
        ls.append(chars[int(h % len(chars))])
        h = h // len(chars)
    return ''.join(ls)


def get_object_state(obj):
    """
    Export current object status

    Comment: Maybe obj.__reduce__() is a better idea? is it stable for hashing?
    """

    if hasattr(obj, '__getnewargs_ex__'):
        args = obj.__getnewargs_ex__()
    elif hasattr(obj, '__getnewargs__'):
        args = obj.__getnewargs__()
    else:
        args = None

    if hasattr(obj, '__sis_state__'):
        state = obj.__sis_state__()
    elif hasattr(obj, '__getstate__'):
        state = obj.__getstate__()
    elif hasattr(obj, '__dict__'):
        state = obj.__dict__
    elif hasattr(obj, '__slots__'):
        state = {k: getattr(obj, k) for k in obj.__slots__ if hasattr(obj, k)}
    else:
        assert args is not None, "Failed to get object state of: %s" % repr(obj)
        state = None

    if args is None:
        return state
    else:
        return args, state


def sis_hash_helper(obj):
    """
    Takes most object and tries to convert the current state into bytes.

    :param object obj:
    :rtype: bytes
    """

    # Store type to ensure it's unique
    byte_list = [type(obj).__qualname__.encode()]

    # Using type and not isinstance to avoid derived types
    if isinstance(obj, bytes):
        byte_list.append(obj)
    elif obj is None:
        pass
    elif type(obj) in (int, float, bool, str, complex):
        byte_list.append(repr(obj).encode())
    elif type(obj) in (list, tuple):
        byte_list += map(sis_hash_helper, obj)
    elif type(obj) in (set, frozenset):
        byte_list += sorted(map(sis_hash_helper, obj))
    elif isinstance(obj, dict):
        # sort items to ensure they are always in the same order
        byte_list += sorted(map(sis_hash_helper, obj.items()))
    elif isfunction(obj):
        # Handle functions
        # Not a nice way to check if the given function is a lambda function, but the best I found
        # assert not isinstance(lambda m: m, LambdaType) is true for all functions
        assert obj.__name__ != '<lambda>', "Hashing of lambda functions is not supported"
        byte_list.append(sis_hash_helper((obj.__module__, obj.__qualname__)))
    elif isclass(obj):
        byte_list.append(sis_hash_helper((obj.__module__, obj.__qualname__)))
    elif hasattr(obj, '_sis_hash'):
        # sis job or path object
        return obj._sis_hash()
    else:
        byte_list.append(sis_hash_helper(get_object_state(obj)))

    byte_str = b'(' + b', '.join(byte_list) + b')'
    if len(byte_str) > 4096:
        # hash long outputs to avoid arbitrary long return values. 4096 is just
        # picked because it looked good and not optimized,
        # it's most likely not that important.
        return hashlib.sha256(byte_str).digest()
    else:
        return byte_str