Elliott C. Back: Internet & Technology

Hashmap Implementation in C

Posted in C, Code by Elliott Back on April 7th, 2005.

Last semester I wrote a Hashmap in C from scratch, which was an interesting experience. It’s the first datastructure I wrote in C, since most of the time I’m working with java, so I learned a lot about pointers and memory allocation (malloc/free). Just uncommented the “free()” at the end I think there is one deallocation bug that I didn’t have a chance to fix, and note that it has references to “semaphores” for thread safety, but you can remove those if you’re in a single threaded environment. Anyway, the full source code is below, as well as in pdf versions:

HashMap.h:

/*
 * Generic hashmap manipulation functions
 */
#ifndef __HASHMAP_H__
#define __HASHMAP_H__

#define MAP_MISSING -3  /* No such element */
#define MAP_FULL -2 	/* Hashmap is full */
#define MAP_OMEM -1 	/* Out of Memory */
#define MAP_OK 0 	/* OK */

/*
 * any_t is a pointer.  This allows you to put arbitrary structures in
 * the hashmap.
 */
typedef void *any_t;

/*
 * PFany is a pointer to a function that can take two any_t arguments
 * and return an integer. Returns status code..
 */
typedef int (*PFany)(any_t, any_t);

/*
 * map_t is a pointer to an internally maintained data structure.
 * Clients of this package do not need to know how hashmaps are
 * represented.  They see and manipulate only map_t's.
 */
typedef any_t map_t;

/*
 * Return an empty hashmap. Returns NULL if empty.
*/
extern map_t hashmap_new();

/*
 * Iteratively call f with argument (item, data) for
 * each element data in the hashmap. The function must
 * return a map status code. If it returns anything other
 * than MAP_OK the traversal is terminated. f must
 * not reenter any hashmap functions, or deadlock may arise.
 */
extern int hashmap_iterate(map_t in, PFany f, any_t item);

/*
 * Add an element to the hashmap. Return MAP_OK or MAP_OMEM.
 */
extern int hashmap_put(map_t in, int key, any_t value);

/*
 * Get an element from the hashmap. Return MAP_OK or MAP_MISSING.
 */
extern int hashmap_get(map_t in, int key, any_t *arg);

/*
 * Remove an element from the hashmap. Return MAP_OK or MAP_MISSING.
 */
extern int hashmap_remove(map_t in, int key);

/*
 * Get any element. Return MAP_OK or MAP_MISSING.
 * remove - should the element be removed from the hashmap
 */
extern int hashmap_get_one(map_t in, any_t *arg, int remove);

/*
 * Free the hashmap
 */
extern void hashmap_free(map_t in);

/*
 * Get the current size of a hashmap
 */
extern int hashmap_length(map_t in);

#endif __HASHMAP_H__

HashMap.c:

/*
 * Generic map implementation. This class is thread-safe.
 * free() must be invoked when only one thread has access to the hashmap.
 */
#include < stdlib.h >
#include < stdio.h >
#include < minithreads/hashmap.h >
#include < minithreads/synch.h >

#define INITIAL_SIZE 1024

// We need to keep keys and values
typedef struct _hashmap_element{
	int key;
	int in_use;
	any_t data;
} hashmap_element;

// A hashmap has some maximum size and current size,
// as well as the data to hold.
typedef struct _hashmap_map{
	int table_size;
	int size;
	hashmap_element *data;
	semaphore_t lock;
} hashmap_map;

/*
 * Return an empty hashmap, or NULL on failure.
 */
map_t hashmap_new() {
	hashmap_map* m = (hashmap_map*) malloc(sizeof(hashmap_map));
	if(!m) goto err;

	m->data = (hashmap_element*) calloc(INITIAL_SIZE, sizeof(hashmap_element));
	if(!m->data) goto err;

	m->lock = (semaphore_t) semaphore_create();
	if(!m->lock) goto err;
	semaphore_initialize(m->lock, 1);

	m->table_size = INITIAL_SIZE;
	m->size = 0;

	return m;
	err:
		if (m)
			hashmap_free(m);
		return NULL;
}

/*
 * Hashing function for an integer
 */
unsigned int hashmap_hash_int(hashmap_map * m, unsigned int key){
	/* Robert Jenkins' 32 bit Mix Function */
	key += (key << 12);
	key ^= (key >> 22);
	key += (key << 4);
	key ^= (key >> 9);
	key += (key << 10);
	key ^= (key >> 2);
	key += (key << 7);
	key ^= (key >> 12);

	/* Knuth's Multiplicative Method */
	key = (key >> 3) * 2654435761;

	return key % m->table_size;
}

/*
 * Return the integer of the location in data
 * to store the point to the item, or MAP_FULL.
 */
int hashmap_hash(map_t in, int key){
	int curr;
	int i;

	/* Cast the hashmap */
	hashmap_map* m = (hashmap_map *) in;

	/* If full, return immediately */
	if(m->size == m->table_size) return MAP_FULL;

	/* Find the best index */
	curr = hashmap_hash_int(m, key);

	/* Linear probling */
	for(i = 0; i< m->table_size; i++){
		if(m->data[curr].in_use == 0)
			return curr;

		if(m->data[curr].key == key && m->data[curr].in_use == 1)
			return curr;

		curr = (curr + 1) % m->table_size;
	}

	return MAP_FULL;
}

/*
 * Doubles the size of the hashmap, and rehashes all the elements
 */
int hashmap_rehash(map_t in){
	int i;
	int old_size;
	hashmap_element* curr;

	/* Setup the new elements */
	hashmap_map *m = (hashmap_map *) in;
	hashmap_element* temp = (hashmap_element *)
		calloc(2 * m->table_size, sizeof(hashmap_element));
	if(!temp) return MAP_OMEM;

	/* Update the array */
	curr = m->data;
	m->data = temp;

	/* Update the size */
	old_size = m->table_size;
	m->table_size = 2 * m->table_size;
	m->size = 0;

	/* Rehash the elements */
	for(i = 0; i < old_size; i++){
		int status = hashmap_put(m, curr[i].key, curr[i].data);
		if (status != MAP_OK)
			return status;
	}

	free(curr);

	return MAP_OK;
}

/*
 * Add a pointer to the hashmap with some key
 */
int hashmap_put(map_t in, int key, any_t value){
	int index;
	hashmap_map* m;

	/* Cast the hashmap */
	m = (hashmap_map *) in;

	/* Lock for concurrency */
	semaphore_P(m->lock);

	/* Find a place to put our value */
	index = hashmap_hash(in, key);
	while(index == MAP_FULL){
		if (hashmap_rehash(in) == MAP_OMEM) {
			semaphore_V(m->lock);
			return MAP_OMEM;
		}
		index = hashmap_hash(in, key);
	}

	/* Set the data */
	m->data[index].data = value;
	m->data[index].key = key;
	m->data[index].in_use = 1;
	m->size++; 

	/* Unlock */
	semaphore_V(m->lock);

	return MAP_OK;
}

/*
 * Get your pointer out of the hashmap with a key
 */
int hashmap_get(map_t in, int key, any_t *arg){
	int curr;
	int i;
	hashmap_map* m;

	/* Cast the hashmap */
	m = (hashmap_map *) in;

	/* Lock for concurrency */
	semaphore_P(m->lock);		

	/* Find data location */
	curr = hashmap_hash_int(m, key);

	/* Linear probing, if necessary */
	for(i = 0; i< m->table_size; i++){

		if(m->data[curr].key == key && m->data[curr].in_use == 1){
			*arg = (int *) (m->data[curr].data);
			semaphore_V(m->lock);
			return MAP_OK;
		}

		curr = (curr + 1) % m->table_size;
	}

	*arg = NULL;

	/* Unlock */
	semaphore_V(m->lock);

	/* Not found */
	return MAP_MISSING;
}

/*
 * Get a random element from the hashmap
 */
int hashmap_get_one(map_t in, any_t *arg, int remove){
	int i;
	hashmap_map* m;

	/* Cast the hashmap */
	m = (hashmap_map *) in;

	/* On empty hashmap return immediately */
	if (hashmap_length(m) <= 0)
		return MAP_MISSING;

	/* Lock for concurrency */
	semaphore_P(m->lock);

	/* Linear probing */
	for(i = 0; i< m->table_size; i++)
		if(m->data[i].in_use != 0){
			*arg = (any_t) (m->data[i].data);
			if (remove) {
				m->data[i].in_use = 0;
				m->size--;
			}
			semaphore_V(m->lock);
			return MAP_OK;
		}

	/* Unlock */
	semaphore_V(m->lock);

	return MAP_OK;
}

/*
 * Iterate the function parameter over each element in the hashmap.  The
 * additional any_t argument is passed to the function as its first
 * argument and the hashmap element is the second.
 */
int hashmap_iterate(map_t in, PFany f, any_t item) {
	int i;

	/* Cast the hashmap */
	hashmap_map* m = (hashmap_map*) in;

	/* On empty hashmap, return immediately */
	if (hashmap_length(m) <= 0)
		return MAP_MISSING;	

	/* Lock for concurrency */
	semaphore_P(m->lock);

	/* Linear probing */
	for(i = 0; i< m->table_size; i++)
		if(m->data[i].in_use != 0) {
			any_t data = (any_t) (m->data[i].data);
			int status = f(item, data);
			if (status != MAP_OK) {
				semaphore_V(m->lock);
				return status;
			}
		}

	/* Unlock */
	semaphore_V(m->lock);

        return MAP_OK;
}

/*
 * Remove an element with that key from the map
 */
int hashmap_remove(map_t in, int key){
	int i;
	int curr;
	hashmap_map* m;

	/* Cast the hashmap */
	m = (hashmap_map *) in;

	/* Lock for concurrency */
	semaphore_P(m->lock);

	/* Find key */
	curr = hashmap_hash_int(m, key);

	/* Linear probing, if necessary */
	for(i = 0; i< m->table_size; i++){
		if(m->data[curr].key == key && m->data[curr].in_use == 1){
			/* Blank out the fields */
			m->data[curr].in_use = 0;
			m->data[curr].data = NULL;
			m->data[curr].key = 0;

			/* Reduce the size */
			m->size--;
			semaphore_V(m->lock);
			return MAP_OK;
		}
		curr = (curr + 1) % m->table_size;
	}

	/* Unlock */
	semaphore_V(m->lock);

	/* Data not found */
	return MAP_MISSING;
}

/* Deallocate the hashmap */
void hashmap_free(map_t in){
	hashmap_map* m = (hashmap_map*) in;
	free(m->data);
	semaphore_destroy(m->lock);
	free(m);
}

/* Return the length of the hashmap */
int hashmap_length(map_t in){
	hashmap_map* m = (hashmap_map *) in;
	if(m != NULL) return m->size;
	else return 0;
}

This entry was posted on Thursday, April 7th, 2005 at 2:03 pm and is tagged with . You can follow any responses to this entry through the RSS 2.0 feed. You can leave a response, or trackback.

27 Responses to “Hashmap Implementation in C”

  1. Martin Bell says:

    Hi, thanks for your code, found it very useful. However there is a bug in hashmap_rehash. You are not checking the in_use flag when you add elements to the new map.

    Thanks

    Martin.

  2. francisco says:

    Good work! Please correct a little bug i found. in hashmap.c line 232. Function hashmap_get_one. Replace:
    *arg = (any_t) (m->data[i].data);
    with:
    arg = (any_t) (m->data[i].data);

  3. Anchal Khanna says:

    Code for synch, as many people have asked for it

    /*
    * synch.h
    *
    * Created on: Apr 12, 2012
    * Author: anchu
    */

    #ifndef __SYNCH_H__
    #define __SYNCH_H__

    typedef void * semaphore_t;

    semaphore_t semaphore_create();
    void semaphore_initialize(semaphore_t p_semaphore, int count);
    void semaphore_P(semaphore_t p_semaphore);
    void semaphore_V(semaphore_t p_semaphore);
    void semaphore_destroy(semaphore_t p_semaphore);

    #endif /* SYNCH_H_ */

    /*
    * synch.c
    *
    * Created on: Apr 12, 2012
    * Author: anchu
    */
    #include “synch.h”
    #include
    #include
    #include
    #include

    struct semaphore {
    sem_t mutex;
    };

    semaphore_t semaphore_create() {
    struct semaphore * sem = (struct semaphore *) malloc(
    sizeof(struct semaphore));
    return sem;
    }
    void semaphore_initialize(semaphore_t p_semaphore, int count) {
    struct semaphore * sem = (struct semaphore *) p_semaphore;
    if (p_semaphore != NULL)
    sem_init(&sem->mutex, 0, count);
    }
    void semaphore_P(semaphore_t p_semaphore) {
    struct semaphore * sem = (struct semaphore *) p_semaphore;
    if (p_semaphore != NULL) {
    sem_wait(&sem->mutex);
    }
    }

    void semaphore_V(semaphore_t p_semaphore) {
    struct semaphore * sem = (struct semaphore *) p_semaphore;
    if (p_semaphore != NULL) {
    sem_post(&sem->mutex);
    }
    }

    void semaphore_destroy(semaphore_t p_semaphore) {
    struct semaphore * sem = (struct semaphore *) p_semaphore;
    if (p_semaphore != NULL) {
    sem_destroy(&sem->mutex);
    }
    }

  4. Chetan says:

    sorry in the prev comment this(minithreads/synch.h)header file name is missing

  5. Chetan says:

    Can any one tell me where i can get this #include header file from …This is written in Hashmap.c

  6. melody says:

    Dude in your hashmap_get function you shouldn’t only check whether in_use is 1 but also if it has been pre-occupied before (or you will iterate over the whole map when the key can’t be found). You need a special value for that, say a random negative number.

    btw. consider linked lists for linking elements which do not create these hash clusters!

  7. Joe says:

    You hashmap overwrites existing values!

  8. vikas says:

    Hi Elliot,
    I find the code useful..
    Could you please forward me the include file so that i can do some modifications to it to serve my purpose.

    Thanks…
    Vikas

  9. Kajal says:

    Hello Elliot,

    Thanks for this posting of the code. I was wondering if you could send me the include files so that I get the fully functional code. As part of my project, I am doing a study of some of the implementations for hash table for space consumption check.

    Thanks.

    Regards,
    Kajal

  10. Srinivas says:

    Hi, Could you please let me know the steps to include n compile in windows ? I am using Win 32 C/C++ compiler comes with Windows SDK.

  11. saark says:

    Yo! You really didn’t understand the various hash methods (i.e. mod, knuth, etc). I mean, you used them all in your hash function. Read some more :)

  12. kingsimba says:

    I made a improment to your code and it’s 100 times faster now. Even 40% faster than the stdext::hash_map in VC2005.

    I will post my code in my blog latter tonight.

    The basic idea is that:
    1. hashmap_get should exit at the first encouter with an empty slot. Or else it will loop through all elements at a miss.

    2. hashmap_remove should rehash the elements between the deleted element and the first empty slot after it.

    3. table_size should always be two times bigger than size.
    This will ensure that there will always be a empty slot not too far way. This will make hashmap_get and hashmap_remove 20 times faster in my case.

    4. some other bug fixes.

  13. art Zweiban says:

    It’s pretty cool to see everybody with interesting names demanding additional work on your part, Mr. Back.

  14. mdiarra says:

    Hi all,

    i find this project very interesting. thanks a lot Elliot. But i need the file sync.h. can you provide it to me please.

    regards

  15. gruva says:

    it would be better if you gave a more clear example to use the module.

  16. tiku says:

    Hey, thx a lot, ur code is great, it really helped me alot…but It just took me along time to figure out. but thx so much…

  17. Collections says:

    Java Collections Framework provides a well designed set of interfaces and classes that support operations on a collections of objects.

  18. sunil says:

    Hi Elliott.
    The code looks good and also easly usable but the 2 include files are needed to you this code and hence makes it un-useable. pls provide me those include files as soon as possible.

    Thanks
    Sunil

  19. mans says:

    Can you please attach the sync.h file and any associated implementation file, as I can see many semaphore related calls but have no idea where to find them !

  20. Boris says:

    Can you give an example of how to use the iteration function?

    Thanks

  21. Elliott Back says:

    Hey Sujay. It’s not obvious, but the hashmap_put function calls hashmap_hash to find an index. Hashmap_hash uses linear probing to find the next available spot. Of course, if the hashmap is full when you call hashmap_put, a resize call will be issued. Therefore, two keys never “hash” to the same array location.

  22. sujay says:

    I think the put function will fail, if you have two key values generating the same hash value in the array range. (say keys “abc” and “xyz” generate a hash code of ‘99′ in the array range 0-1024 (According to the program i think the key value will be ‘99′..right ??). How will this put function deal with this?? It will overwrite the existing one.

    I think the solution to this is to store even the key values (“abc” or “xyz” ) in the structure hash_element, so that we can compare this while inserting and this must be unique.

  23. Elliott Back says:

    As you can tell from the datastructure, you need to be passing pointers to the object memory. If you allocate a static struct “s”, to put it in the map you would call the

    hashmap_put(map_t in, int key, any_t value)

    method. You can think of any_t as an alias for void *: it’s defined in the header somewhere, So, you’d call:

    hashmap_put(in, 1337, &s);

    If s is already a pointer type, just call:

    hashmap_put(in, 1337, s);

    To get something out of the datastructure, again just pass a pointer:

    your_struct_name s;
    hashmap_get(in, 1337, &s);

    Don’t forget to check the return values! All the hashmap methods should return a constant indicating success.

  24. vinod says:

    Thanks for the example.
    Can we store any type of variable e.g. struture as a value ?
    If yes how can I retrieve the values from the hash map.
    The hashmap_get is returning me junk values.I would appreciate a code sample.

  25. Elliott Back says:

    The above code allocates a datastructure for a filesystem I wrote, allocates the blocks hashmap, which keeps track of all the physical blocks of memory we can use, and then I show a for loop which puts the blocks into the hashmap indexed by the current length of the hashmap, i.e. 0, 1, … , n. This way I end up with a hashmap of all the blocks currently in use in the filesystem (upon load).

  26. Elliott Back says:

    struct minifile {
        /* Other blocks */
        map_t blocks;
    };

    typedef struct minifile* minifile_t;

    // Allocate a file
    file = (minifile_t) malloc(sizeof(struct minifile));
    if(!file) return NULL;

    // Initialize blocks
    file->blocks = hashmap_new();
    if (!file->blocks) return NULL;

    /* Get blocks */
    for(i = 0; i < num_blocks; i++){
        int *temp = (int *) malloc(sizeof(1));
        *temp = *inode; inode++;
        if(hashmap_put(file->blocks, hashmap_length(file->blocks), temp) != MAP_OK)
        return NULL;
    }

  27. vinod says:

    can you send me sample example on how to use this hashmap ?

Leave a Reply

Powered by WP Hashcash