From cf932f82a0042f1c2cd6bc6f1097a924b7e0606a Mon Sep 17 00:00:00 2001
From: Dushyant Behl
Date: Wed, 11 Sep 2024 14:38:36 +0530
Subject: [PATCH 1/2] Introduce a FIPS-compatible hashing algorithm with
 variable-size digest support for the aim hashing library.

Signed-off-by: Dushyant Behl
---
 aim/storage/hashing/hashing.py | 54 +++++++++++++++++++++++++++++-----
 1 file changed, 47 insertions(+), 7 deletions(-)

diff --git a/aim/storage/hashing/hashing.py b/aim/storage/hashing/hashing.py
index 1aaa7e52e5..0195831c26 100644
--- a/aim/storage/hashing/hashing.py
+++ b/aim/storage/hashing/hashing.py
@@ -7,6 +7,7 @@
 * Our implementation are less prone to manually designed collisions.
 """
 
+import _hashlib
 import hashlib
 from typing import Tuple, Union
 
@@ -34,6 +35,44 @@
 _HASH_STR_SALT = encode_int64(7540324813251503183)
 _HASH_BYTES_SALT = encode_int64(-6836296829636613855)
 
+# Invoke the hashlib algorithm based on the security mode.
+# In normal mode we use the original blake2b-based hashing.
+# In the restrictive RHEL FIPS mode, hashlib functions such as blake2
+# are limited to the OpenSSL blake2 implementation, which restricts the
+# parameters and therefore does not allow customising the digest size.
+# As an alternative in FIPS mode we use shake_256, which provides
+# variable-length digest support and is an acceptable SHA-3 family algorithm.
+# This class wraps both, since their digest() signatures differ.
+class aim_hash_algorithm:
+    digest_size: int = _HASH_SIZE
+    salt: bytes
+    is_fips_mode_enabled: bool
+    hashlib_state = None
+
+    # Based on the FIPS mode, choose between the blake2b and shake_256 hash functions.
+    def _invoke_hashlib(self):
+        if not self.is_fips_mode_enabled:
+            return hashlib.blake2b(digest_size=self.digest_size, salt=self.salt)
+        else:
+            return hashlib.shake_256()
+
+    def __init__(self, digest_size=None, salt=None):
+        if digest_size:
+            self.digest_size = digest_size
+        self.salt = salt
+        self.is_fips_mode_enabled = _hashlib.get_fips_mode() == 1
+        self.hashlib_state = self._invoke_hashlib()
+
+    def update(self, obj: bytes):
+        self.hashlib_state.update(obj)
+
+    def digest(self):
+        if not self.is_fips_mode_enabled:
+            # blake2 digest signature
+            return self.hashlib_state.digest()
+        else:
+            # shake_256 digest signature with variable length
+            return self.hashlib_state.digest(length=self.digest_size)
 
 def hash_none(obj: NoneType = None) -> int:
     """Hash None values."""
@@ -47,7 +86,8 @@ def hash_uniform(bad_hash):
     in real applications) craft / find such examples that `a != b` but
     `hash(a) == hash(b)`
     """
-    state = hashlib.blake2b(encode_int64(bad_hash), digest_size=_HASH_SIZE, salt=_HASH_UNIFORM_SALT)
+    state = aim_hash_algorithm(salt=_HASH_UNIFORM_SALT)
+    state.update(encode_int64(bad_hash))
     return decode_int64(state.digest())
 
 
@@ -75,17 +115,18 @@ def hash_bool(obj: bool) -> int:
 
 
 def hash_bytes(obj: bytes) -> int:
     """Hash an `bytes` buffer"""
     # We use `blake2b` to hash the `bytes` object
-    state = hashlib.blake2b(obj, digest_size=_HASH_SIZE, salt=_HASH_BYTES_SALT)
+    state = aim_hash_algorithm(salt=_HASH_BYTES_SALT)
+    state.update(obj)
     return decode_int64(state.digest())
 
-
 def hash_string(obj: str) -> int:
     """Hash an string object"""
     # Similar to `bytes`, we use `blake2b` to hash strings as well
    # First, we encode them to `utf-8` and then compute the hash
    # but *a different hash seed is provided* to make sure strings and their
    # utf-8 encoded blobs do not map to the same hash.
-    state = hashlib.blake2b(obj.encode('utf-8'), digest_size=_HASH_SIZE, salt=_HASH_STR_SALT)
+    state = aim_hash_algorithm(salt=_HASH_STR_SALT)
+    state.update(obj.encode('utf-8'))
     return decode_int64(state.digest())
 
 
@@ -95,11 +136,10 @@ def hash_array(obj: AimObjectArray) -> int:
     We do not take into account whether it is a `list` or `tuple`, so
     `hash([1, 2, ['x', 5]]) == hash((1, 2, ('x', 5)))`
     """
-    state = hashlib.blake2b(digest_size=_HASH_SIZE, salt=_HASH_ARRAY_SALT)
+    state = aim_hash_algorithm(salt=_HASH_ARRAY_SALT)
     for i in obj:
         piece_hash = hash_auto(i)
         state.update(encode_int64(piece_hash))
-
     return decode_int64(state.digest())
 
 
@@ -117,7 +157,7 @@ def hash_object(obj: AimObjectDict) -> int:
     The implementation does not take into account the order
     `hash({'a': 5, 'b': 7}) == hash({'b': 7, 'a': 5})`
     """
-    state = hashlib.blake2b(digest_size=_HASH_SIZE, salt=_HASH_OBJECT_SALT)
+    state = aim_hash_algorithm(salt=_HASH_OBJECT_SALT)
     # Here we use `key_cmp` to run over the object keys in an (meaningless but)
     # deterministic order.
     for key_val_tuple in sorted(obj.items(), key=key_cmp):

From 891cbd3b72adf29f0bf97698652dbebf94a593a6 Mon Sep 17 00:00:00 2001
From: Dushyant Behl
Date: Thu, 12 Sep 2024 19:31:20 +0530
Subject: [PATCH 2/2] Fix ordering issues in test cases across Python versions.

Signed-off-by: Dushyant Behl
---
 tests/api/test_run_images_api.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/tests/api/test_run_images_api.py b/tests/api/test_run_images_api.py
index c85518216b..57f797cdf2 100644
--- a/tests/api/test_run_images_api.py
+++ b/tests/api/test_run_images_api.py
@@ -213,7 +213,7 @@ def tearDown(self) -> None:
     @parameterized.expand([(1,), (5,), (10,)])
     def test_images_uri_bulk_load_api(self, uri_count):
         # take random N URIs
-        uris = random.sample(self.uri_map.keys(), uri_count)
+        uris = random.sample(list(self.uri_map.keys()), uri_count)
 
         client = self.client
         response = client.post('/api/runs/images/get-batch', json=uris)
@@ -420,12 +420,12 @@ def test_run_info_get_all_sequences_api(self, qparams, trace_type_count):
         self.assertEqual('image_lists', response_data['traces']['images'][0]['name'])
         metrics_data = response_data['traces']['metric']
         self.assertEqual(3, len(metrics_data))
-        self.assertEqual('floats', metrics_data[0]['name'])
-        self.assertEqual('floats', metrics_data[1]['name'])
-        self.assertEqual('integers', metrics_data[2]['name'])
-        self.assertDictEqual({'subset': 'val'}, metrics_data[0]['context'])
-        self.assertDictEqual({'subset': 'train'}, metrics_data[1]['context'])
-        self.assertDictEqual({'subset': 'train'}, metrics_data[2]['context'])
+        contexts = []
+        for m in metrics_data:
+            contexts.append((m['context'], m['name']))
+        self.assertIn(({'subset': 'val'}, 'floats'), contexts)
+        self.assertIn(({'subset': 'train'}, 'floats'), contexts)
+        self.assertIn(({'subset': 'train'}, 'integers'), contexts)
 
         response = client.get(f'api/runs/{self.run2_hash}/info', params={'sequence': ('images', 'metric')})
         self.assertEqual(200, response.status_code)
@@ -437,10 +437,11 @@ def test_run_info_get_all_sequences_api(self, qparams, trace_type_count):
         self.assertEqual('single_images', response_data['traces']['images'][0]['name'])
         metrics_data = response_data['traces']['metric']
         self.assertEqual(2, len(metrics_data))
-        self.assertEqual('floats', metrics_data[0]['name'])
-        self.assertEqual('floats', metrics_data[1]['name'])
-        self.assertDictEqual({'subset': 'val'}, metrics_data[0]['context'])
-        self.assertDictEqual({'subset': 'train'}, metrics_data[1]['context'])
+        contexts = []
+        for m in metrics_data:
+            contexts.append((m['context'], m['name']))
+        self.assertIn(({'subset': 'val'}, 'floats'), contexts)
+        self.assertIn(({'subset': 'train'}, 'floats'), contexts)
 
     def test_run_info_get_metrics_only_api(self):
         client = self.client
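
A brief note on the two digest() paths in patch 1 (the sketch below is not part of the patches): hashlib.blake2b fixes the digest size, and optionally a salt, at construction time, whereas shake_256 is an extendable-output function that takes the desired length at digest() time and accepts no salt, which is also why hashes computed in FIPS mode will differ from the blake2b ones. The values in the sketch are illustrative assumptions; the 8-byte size simply matches what decode_int64 consumes.

    # Minimal standalone sketch of the two hashlib APIs the wrapper bridges.
    import hashlib

    DIGEST_SIZE = 8                        # assumed to match aim's _HASH_SIZE
    SALT = (1234).to_bytes(8, 'little')    # hypothetical salt; blake2b accepts up to 16 bytes

    # blake2b: digest size and salt are fixed when the hash object is created.
    blake = hashlib.blake2b(digest_size=DIGEST_SIZE, salt=SALT)
    blake.update(b'payload')
    assert len(blake.digest()) == DIGEST_SIZE

    # shake_256: the output length is chosen only when the digest is read.
    shake = hashlib.shake_256()
    shake.update(b'payload')
    assert len(shake.digest(DIGEST_SIZE)) == DIGEST_SIZE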