Hierarchical Navigable Small Worlds: The Meet
As part of a team that is mostly creating solutions for machine learning and LLM-based systems, I love learning the internals after getting some idea for the tools and algorithms, and today's topic is Hierarchical Navigable Small Worlds or HNSW as a short form.
So, the first question is, what is that? Well, even if the naming is something we might scare, I can simplify its definition as "yet another search tool" or "another vector search way".
Now, let's take a look deeper with Wikipedia definition
The Hierarchical navigable small world (HNSW) algorithm is a graph-based approximate nearest neighbor search technique used in many vector databases.
We can see that is used for searching something.
How does it work?
First, we need to know that HNSW is an approximate k-nearest neighbor algorithm, which means that it is not a 100% accurate operation, and it offers logarithmic scaling for high-dimensional data.
When we look at its internals, it is expected to see a "hierarchy," which is the key point of this structure. It uses multiple layers to reach the search query. With each layer, the query gets more detail. I resemble this approach to lecture books. Like them, it first shows us the basics, then the detail comes more and more with the next pages/chapters.
Another thing that we should know about the layers is that from the start layer to the end layer, nodes are increased, or we may say opposite, there is not much node on the first layers.
We may say that this structure consist from two different approach;
- NSW, and
- Skip List (see this blog post for more info Skip List: A Probabilistic Data Structure for Search, Insert and Delete)
It is all good for now, but if we don't create a well-distributed layer, it has no meaning. To make the layers more "navigable," there are 2 key parameters;
- Mmax; This parameter is transferred from NSW, which means that number of bi-directional links per layer
- efConstruction: An HNSW-specific parameter is used to size the dynamic candidate list for construction.
Simple implementation
import numpy as np
import random
from typing import List, Tuple, Set
import math
class HNSW:
def __init__(self, dim: int, max_elements: int, M: int = 16, ef_construction: int = 200):
"""
Initialize HNSW index.
Args:
dim: Dimension of vectors
max_elements: Maximum number of elements to store
M: Number of bi-directional links per layer
ef_construction: Size of dynamic candidate list for construction
"""
self.dim = dim
self.max_elements = max_elements
self.M = M
self.ef_construction = ef_construction
# Initialize layers with at least one layer
self.layers: List[List[Tuple[np.ndarray, Set[int]]]] = [[]]
self.entry_point: int | None = None
self.max_layer = 0
def _random_level(self) -> int:
"""Generate random level for new element."""
return int(-math.log(random.random()) * self.M)
def _distance(self, a: np.ndarray, b: np.ndarray) -> float:
"""Calculate L2 distance between two vectors."""
return float(np.linalg.norm(a - b))
def _select_neighbors(self, q: np.ndarray, candidates: List[Tuple[float, int]], M: int) -> List[int]:
"""Select M nearest neighbors from candidates."""
candidates.sort(key=lambda x: x[0])
return [idx for _, idx in candidates[:M]]
def _search_layer(self, q: np.ndarray, entry_points: List[int], ef: int, layer: int) -> List[Tuple[float, int]]:
"""Search in a specific layer."""
if not entry_points or layer >= len(self.layers):
return []
# Ensure the layer exists and has elements
if layer >= len(self.layers) or not self.layers[layer]:
return []
candidates = [(self._distance(q, self.layers[layer][ep][0]), ep) for ep in entry_points]
candidates.sort(key=lambda x: x[0])
results = candidates[:ef]
visited = set(ep for _, ep in candidates)
while candidates:
current_dist, current = candidates.pop(0)
if current_dist > results[-1][0]:
break
for neighbor in self.layers[layer][current][1]:
if neighbor in visited:
continue
visited.add(neighbor)
dist = self._distance(q, self.layers[layer][neighbor][0])
if dist < results[-1][0] or len(results) < ef:
candidates.append((dist, neighbor))
candidates.sort(key=lambda x: x[0])
if len(results) < ef:
results.append((dist, neighbor))
else:
results[-1] = (dist, neighbor)
results.sort(key=lambda x: x[0])
return results
def insert(self, vector: np.ndarray) -> None:
"""Insert a vector into the index."""
if len(vector) != self.dim:
raise ValueError(f"Vector dimension must be {self.dim}")
# Generate random level for the new element
level = self._random_level()
# Add new layers if necessary
while len(self.layers) <= level:
self.layers.append([])
# Insert the vector
vector_id = len(self.layers[0])
self.layers[0].append((vector, set()))
# If this is the first element, make it the entry point
if self.entry_point is None:
self.entry_point = vector_id
self.max_layer = level
return
# Insert in each layer
current_ep = self.entry_point
for layer in range(min(self.max_layer, level), -1, -1):
# Find neighbors in current layer
neighbors = self._search_layer(vector, [current_ep], self.ef_construction, layer)
if not neighbors:
continue
# Select neighbors and add bidirectional links
selected_neighbors = self._select_neighbors(vector, neighbors, self.M)
for neighbor_id in selected_neighbors:
self.layers[layer][vector_id][1].add(neighbor_id)
self.layers[layer][neighbor_id][1].add(vector_id)
# Update entry point for next layer
if layer > 0:
current_ep = selected_neighbors[0]
# Update entry point if necessary
if level > self.max_layer:
self.entry_point = vector_id
self.max_layer = level
def search(self, query: np.ndarray, k: int = 1) -> List[Tuple[float, int]]:
"""Search for k nearest neighbors of query vector."""
if self.entry_point is None:
return []
# Start from top layer
current_ep = self.entry_point
# Find the highest layer that contains the entry point
top_layer = 0
for layer in range(self.max_layer, -1, -1):
if layer < len(self.layers) and current_ep < len(self.layers[layer]):
top_layer = layer
break
# Search through layers
for layer in range(top_layer, 0, -1):
candidates = self._search_layer(query, [current_ep], 1, layer)
if candidates:
current_ep = candidates[0][1]
# Search in bottom layer
candidates = self._search_layer(query, [current_ep], self.ef_construction, 0)
candidates.sort(key=lambda x: x[0])
return candidates[:k] hnsw.py
To use the this class
import numpy as np
from hnsw import HNSW
def main():
# Create a simple HNSW index for 2D vectors
index = HNSW(dim=2, max_elements=1000)
# Generate some random 2D vectors
vectors = np.random.rand(10, 2)
# Insert vectors
for vector in vectors:
index.insert(vector)
# Create a query vector
query = np.array([0.5, 0.5])
# Search for nearest neighbors
neighbors = index.search(query, k=3)
print("Query vector:", query)
print("\nNearest neighbors:")
for dist, idx in neighbors:
print(f"Vector: {vectors[idx]}, Distance: {dist:.4f}")
if __name__ == "__main__":
main() example.py
And the output will be something like this;
Query vector: [0.5 0.5]
Nearest neighbors:
Vector: [0.28143965 0.53774672], Distance: 0.2218
Vector: [0.57665313 0.27801642], Distance: 0.2348
Vector: [0.30992482 0.7956484 ], Distance: 0.3515output
References





