/****************************************************************************/
/*****                                                                  *****/
/*****                     malloc.heapServer.c                          *****/
/*****                                                                  *****/
/*****  Program written and tested by Mazen Kharbutli                   *****/
/*****  Based on the research work of Mazen Kharbutli, Yan Solihin,     *****/
/*****                                and Milos Prvulovic               *****/   
/*****                                                                  *****/
/*****  This is my FULLY-OPTIMIZED 'Heap Server' library                *****/
/*****                                                                  *****/
/*****  November 21, 2005                                               *****/
/*****                                                                  *****/
/****************************************************************************/


/****************************************************************************/
/*****                                                                  *****/
/*****                        NO WARRANTY                               *****/
/*****                                                                  *****/
/***** This is a research prototype implementation. It may not be fully *****/
/*****     debugged and we do not guarantee the absence of all security *****/
/*****     vulnerabilities.                                             *****/
/***** It is provided for free "AS IS" with no warranties whatsoever.   *****/
/***** The entire risk of using this program falls on the user.         *****/
/***** The authors of this software are in no way liable to you for     *****/
/*****     any damages, of any type, arising from using this software.  *****/
/*****                                                                  *****/
/****************************************************************************/


/****************************************************************************/
/*****                           USAGE                                  *****/
/*****                                                                  *****/
/***** To use Heap Server:                                              *****/
/***** 1. Call the function malloc_stats() at the end of your program.  *****/
/***** 2. Compile this file with the desired optimization LEVEL         *****/
/*****    (see below) and link to your C/C++ application.               *****/
/*****                                                                  *****/
/***** Heap Server has been designed for, and only tested on, systems   *****/
/*****     with x86 ISA processors running Linux.                       *****/
/*****                                                                  *****/
/***** To include timing stats, compile with the -DMALLOC_TIMING flag   *****/
/****************************************************************************/


/****************************************************************************/
/*****                        REFERENCES                                *****/
/*****                                                                  *****/
/***** To better understand Heap Server, please refer to:               *****/
/***** 1. My Ph.D. dissertation from North Carolina State University:   *****/
/*****   Improving the Security of the Heap through Inter-Process       *****/
/*****   Protection and Intra-Process Temporal Protection               *****/
/***** 2. The following ASPLOS paper:                                   *****/
/*****   Mazen Kharbutli, Xiaowei Jiang, Yan Solihin, Guru              *****/
/*****   Venkataramani, and Milos Prvulovic, “Comprehensively and       *****/
/*****   Efficiently Protecting the Heap”, in the Proceedings of the    *****/
/*****   ACM International Symposium on Architectural Support for       *****/
/*****   Programming Languages and Operating Systems (ASPLOS), Oct. 2006*****/ 
/****************************************************************************/


#include<malloc.h>
#include<stddef.h>
#include<unistd.h>
#include<stdio.h>
#include<fcntl.h>
#include<stdlib.h>
#include<sys/types.h>
#include<sys/msg.h>
#include<sys/ipc.h>
#include<sys/mman.h>

/****************************************************************************/
/*****                   OPTIMIZATION LEVELS                            *****/
/*****                                                                  *****/
/***** LEVEL indicates the optimization level of Heap Server            *****/
/*****    it is set during compilation using the -DLEVEL=x flag         *****/
/***** LEVEL 0: Blocking communication with no optimizations            *****/
/***** LEVEL 1: Non-blocking communication                              *****/
/***** LEVEL 2: Non-blocking communication + Bulk deallocation          *****/
/***** LEVEL 3: Non-blocking communication + Bulk deallocation +        *****/
/*****          pre-allocation                                          *****/
/***** LEVEL 4: Non-blocking communication + Bulk deallocation +        *****/
/*****          pre-allocation + dual-copy protection                   *****/
/****************************************************************************/

#ifndef LEVEL
#define LEVEL 0
#endif

#if LEVEL >= 5 || LEVEL < 0
#undef  LEVEL
#define LEVEL 0
#endif

/****************************************************************************/
/************ Definitions, Globals, Structures, and Prototypes **************/
/****************************************************************************/

#define ALIGNMENT        8  /* alignment of chunk sizes in bytes */
#define ALIGN_MASK       7  /* alignment mask */
#define MINSIZE         16  /* smallest possible chunk size in bytes */
#define NBINS          128  /* number of free chunks bins */
#define MIN_LARGE_SIZE 512  /* smallest chunk size in non-fixed-size bins */
#define MAX_FAST_SZ    128  /* largest chunk size that cannot be consolidated */
#define NUM_CHUNKS    1024  /* number of chunk structures to allocate at once */
#define TOPTRIM    1048576  /* minimum size of top to trigger trimming */
#define TOPSLACK    262144  /* size of top to keep after trimming */
#define MAX_RAND_OBF    16  /* maximum number of chunks to traverse in a bin */
                            /* when randomly choosing a chunk for layout obf. */

/* total deallocs/allocations to trigger bulk-deallocation/pre-allocation */
#define BULK_DEALLOC_THRESH 1024
#define PRE_ALLOC_THRESH    1024

/* number of pointers in a single bulk-dealloc/pre-allocation message */
#define BULK_DEALLOC_PTRS     64
#define PRE_ALLOC_PTRS       512

/* minimum number of chunks in bin to make it possible to allocate chunks from it */
#define MIN_CHUNKS_TO_USE_BIN  1

/* conversion from a pointer to an address and vice-versa */
#define chunk2mem(p)	((void*)(p))
#define mem2chunk(mem)  ((ulong)(mem))

/* pad chunk size request to meet size alignment requirements */
#define req2size(req) (((req) + ALIGN_MASK < MINSIZE) ? MINSIZE : \
                       ((req) + ALIGN_MASK) & ~ALIGN_MASK)

/* There are two types of free-chunks bins:    */
/*    smallbins hold chunks of fixed sizes     */
/*    largebins hold chunks within size ranges */   
#define in_smallbin_range(sz)  ((ulong)(sz) < (ulong)MIN_LARGE_SIZE)
#define smallbin_index(sz)     (((ulong)(sz)) >> 3)

/* remove the chunk pointed to by P->ptr from a free chunks bin and place */
/* in bins[0] for future usage */
#define unlink(P, T) { T           = P->ptr;      \
                       P->ptr      = T->ptr;      \
		       T->ptr      = bins[0].ptr; \
		       bins[0].ptr = T; }

/* this macro reads the system's time stamp counter */
#define rdtscll(val) __asm__ __volatile__ ("rdtsc" : "=A" (val))
unsigned long long mallocTime, callocTime, reallocTime, freeTime, 
                   sleepTime, bulkDeallocTime, preAllocTime, tt1, tt2;

/* useful counters, heap addresses, sizes, and pointers */
ulong  first_time = 0;     /* to identify the first heap request */
ulong  top, topSize;       /* top-most available free chunk (bordering */
                           /* the end of available heap memory) and its size */
ulong  pagesize, pagemask; /* system's page size and its mask */
ulong  heapStart;          /* base address of application's heap */    
ulong  newChunkReq;        /* number of new chunk structure requests made */
ulong  bitMapSize;         /* cuurent size of the memory bit-map arrays */
ulong  doPreAlloc;         /* to indicate if pre-allocation is in use or not */
ulong  ptrsArrayPtr[64];   /* pointers to entries in bulk deallocation and */ 
                           /* pre-allocation messages and arrays */
int    to_sbrk;            /* used by Heap Server to keep track of the amount */
                           /* of memory the application has to sbrk() */
static int dev_zero_fd;

ulong  mallocs, callocs, reallocs, frees, 
       bulkDeallocs, preAllocs, max_foot_print;   /* statistics */

/* memory bit-map arrays structure */
struct memMapWord { ulong word1, word2; } *memMap;

/* free chunk structure and bins */
struct Chunk { ulong address; struct Chunk* ptr; } bins[NBINS], *chunks;
typedef struct Chunk* mchunkptr;
typedef struct Chunk* mbinptr;

/* useful simple functions */
void errNexit(char *str) { fprintf(stderr, "Mazen: %s\n", str); exit(0); } 
void do_sbrk(int sz) { if(sbrk(sz) == (void*)(-1)) errNexit("sbrk failed"); }

/* Three different types of messages */
struct NORMAL_MSG            /* message for normal requests/replies */
{
   long  mtype;              /* request/reply type */
   int   value;              /* used to pass additional info with message */
   void* ptr;                /* pointer to a chunk in the application's heap */
} Request, Reply;

struct BULK_DEALLOC_STRUCT   /* bulk deallocation message */
{
   long  mtype;
   void* ptrs[BULK_DEALLOC_PTRS];
} bulkDeallocMsg;

struct PRE_ALLOC_STRUCT      /* pre-allocation message */
{
   long  mtype;
   int   value;
   void* ptrs[PRE_ALLOC_PTRS];
} preAllocMsg[64];

/* arrays of pointers used as dual copies for protection at application's side */
void **bulkDeallocBackup, **preAllocBackup[64];

/* message sizes and message queues specifics */
size_t msgSize, bulkDeallocMsgSize, preAllocMsgSize;
int    qid1, qid2; 
pid_t  pid;
key_t  key1, key2;
  

/****************************************************************************/
/************************** Chunk Size and Inuse ****************************/
/****************************************************************************/

ulong wordMasks[]  = { 0xFFFFFFFF,
                       0xFFFFFFFE, 0xFFFFFFFC, 0xFFFFFFF8, 0xFFFFFFF0, 
                       0xFFFFFFE0, 0xFFFFFFC0, 0xFFFFFF80, 0xFFFFFF00, 
		       0xFFFFFE00, 0xFFFFFC00, 0xFFFFF800, 0xFFFFF000, 
		       0xFFFFE000, 0xFFFFC000, 0xFFFF8000, 0xFFFF0000, 
		       0xFFFE0000, 0xFFFC0000, 0xFFF80000, 0xFFF00000, 
		       0xFFE00000, 0xFFC00000, 0xFF800000, 0xFF000000, 
		       0xFE000000, 0xFC000000, 0xF8000000, 0xF0000000, 
		       0xE0000000, 0xC0000000, 0x80000000, 0x00000000 };
ulong bitMasks[]   = { 0x00000001, 0x00000002, 0x00000004, 0x00000008, 
                       0x00000010, 0x00000020, 0x00000040, 0x00000080, 
		       0x00000100, 0x00000200, 0x00000400, 0x00000800, 
		       0x00001000, 0x00002000, 0x00004000, 0x00008000, 
		       0x00010000, 0x00020000, 0x00040000, 0x00080000, 
		       0x00100000, 0x00200000, 0x00400000, 0x00800000, 
		       0x01000000, 0x02000000, 0x04000000, 0x08000000, 
		       0x10000000, 0x20000000, 0x40000000, 0x80000000 };

/* this function is used by Heap Server and returns the size of a chunk */
/* in the application's heap given its base address */
ulong chunkSize(ulong address)
{
   ulong offset, startWord, bitPos1, bitPos2, masked;

   offset    = (address - heapStart) >> 3;
   startWord = offset >> 5;
   bitPos1   = offset & 31;   
   masked    = memMap[startWord].word1 & wordMasks[bitPos1+1];
   
   if(masked != 0)
   {
      asm("bsfl %1,%0\n\t"
             : "=r" (bitPos2) 
             : "g"  (masked));
      return ((bitPos2 - bitPos1) << 3);
   }
   else if(memMap[startWord+1].word1 == 0) 
        return (memMap[startWord+1].word2 & 0x7FFFFFFF);
   
   asm("bsfl %1,%0\n\t"
          : "=r" (bitPos2) 
          : "g"  (memMap[startWord+1].word1));
   return ((bitPos2 + 32 - bitPos1) << 3);
}

/* this function is similar to 'ulong chunkSize(ulong address)' above except */
/* that it returns 0 if the chunk is allocated (not free) */
ulong chunkSizeNused(ulong address)
{
   ulong offset, startWord, bitPos1, bitPos2, masked;

   offset    = (address - heapStart) >> 3;
   startWord = offset >> 5;
   bitPos1   = offset & 31;   
   masked    = memMap[startWord].word1 & wordMasks[bitPos1+1];
   
   if((memMap[startWord].word2 & bitMasks[bitPos1]) != 0) return 0;

   if(masked != 0)
   {
      asm("bsfl %1,%0\n\t"
             : "=r" (bitPos2) 
             : "g"  (masked));
      return ((bitPos2 - bitPos1) << 3);
   }
   else if(memMap[startWord+1].word1 == 0) 
        return (memMap[startWord+1].word2 & 0x7FFFFFFF);
   
   asm("bsfl %1,%0\n\t"
          : "=r" (bitPos2) 
          : "g"  (memMap[startWord+1].word1));
   return ((bitPos2 + 32 - bitPos1) << 3);
}

/* this function is used by Heap Server and returns the size of a preceding */
/* chunk in the application's heap given a base address. It returns 0 if that */
/* preceding chunk is allocated (not free) */
ulong prevSize(ulong address)
{
   ulong offset, startWord, bitPos1, bitPos2, masked;
   
   if(address == heapStart) return 0;

   offset    = (address - heapStart) >> 3;
   startWord = offset >> 5;
   bitPos1   = offset & 31;   
   masked    = memMap[startWord].word1 & ~wordMasks[bitPos1];

   if(bitPos1 != 0 && ((memMap[startWord].word2 & bitMasks[bitPos1-1]) != 0)) 
      return 0;
   if(bitPos1 == 0 && ((memMap[startWord-1].word2 & bitMasks[31]) != 0))      
      return 0;
   
   if(masked != 0)
   {
      asm("bsrl %1,%0\n\t"
             : "=r" (bitPos2) 
             : "g"  (masked));
      return ((bitPos1 - bitPos2) << 3);
   }
   else if(memMap[startWord-1].word1 == 0) 
        return (memMap[startWord-1].word2 & 0x7FFFFFFF);
   
   asm("bsrl %1,%0\n\t"
          : "=r" (bitPos2) 
          : "g"  (memMap[startWord-1].word1));
   return ((bitPos1 + 32 - bitPos2) << 3);
}

/* this function is used by Heap Server to fill the memory bit-map arrays */
/* with a chunk's information */
void writeSize(ulong address, ulong size, ulong inuse)
{
   ulong offset1, offset2, startWord, endWord, bitPos1, bitPos2;
   ulong bits, temp1, temp2;
   
   bits      = size >> 3;
   offset1   = (address - heapStart) >> 3;
   offset2   = offset1 + bits;
   startWord = offset1 >> 5;
   endWord   = offset2 >> 5;
   bitPos1   = offset1 & 31; 
   bitPos2   = offset2 & 31;  
   
   memMap[startWord].word1 = memMap[startWord].word1 | bitMasks[bitPos1];
   memMap[endWord].word1   = memMap[endWord].word1   | bitMasks[bitPos2];
   
   if(startWord == endWord)
      memMap[startWord].word1 = memMap[startWord].word1 & 
                            (~wordMasks[bitPos1+1] | wordMasks[bitPos2]);
   else
   {   
      if((endWord - startWord) > 1)
      {
         memMap[startWord+1].word1 = memMap[endWord-1].word1 = 0;
         memMap[startWord+1].word2 = memMap[endWord-1].word2 = size;
      }
   
      memMap[startWord].word1 = memMap[startWord].word1 & ~wordMasks[bitPos1+1];
      memMap[endWord].word1   = memMap[endWord].word1   & wordMasks[bitPos2];
   }
   
   if(inuse == 1)
        memMap[startWord].word2 = memMap[startWord].word2 | bitMasks[bitPos1];
   else memMap[startWord].word2 = memMap[startWord].word2 & (~bitMasks[bitPos1]);
   
   if(bitPos2 == 0) { temp1 = endWord - 1; temp2 = 31;          }
   else             { temp1 = endWord;     temp2 = bitPos2 - 1; }
   
   if(inuse == 1)
        memMap[temp1].word2 = memMap[temp1].word2 | bitMasks[temp2];
   else memMap[temp1].word2 = memMap[temp1].word2 & (~bitMasks[temp2]);
}      

/* this function is used by Heap Server to update the bit-map arrays when */
/* a chunk is re-allocated */
void setInUse(ulong address, ulong size)
{
   ulong offset1, offset2, startWord, endWord, bitPos1, bitPos2;
   ulong bits, temp1, temp2;
   
   bits      = size >> 3;
   offset1   = (address - heapStart) >> 3;
   offset2   = offset1 + bits;
   startWord = offset1 >> 5;
   endWord   = offset2 >> 5;
   bitPos1   = offset1 & 31; 
   bitPos2   = offset2 & 31;  
   
   memMap[startWord].word2 = memMap[startWord].word2 | bitMasks[bitPos1];
   
   if(bitPos2 == 0) { temp1 = endWord - 1; temp2 = 31;          }
   else             { temp1 = endWord;     temp2 = bitPos2 - 1; }
   
   memMap[temp1].word2 = memMap[temp1].word2 | bitMasks[temp2];
}    


/****************************************************************************/
/******************************** Bins **************************************/
/****************************************************************************/

/* Indexing

	   Size Range [ , )	 Bins Range	Spacing
	   ----------------	 ----------	-------
	           8-512	     1-63	     8 (exact)
	        512-2048	    64-87	    64
	       2048-8192	    88-99	   512
	      8192-32768	  100-111	  2048
	    32768-131072	  112-123	  8192
	   131072-524288	  124-126       131072
	        >=524288	      127               */

/* this function is used by Heap Server and returns the index of the */
/* free-chunks largebin corresponding to a certain size */
ulong largebin_index(ulong size) 
{
   ulong shift[] = {0, 0, 0, 0, 0, 0, 0, 0, 0,
                    6, 6, 9, 9, 11, 11, 13, 13, 17, 17};
   ulong add[]   = {0, 0, 0, 0, 0, 0, 0, 0, 0,
                    56, 56, 84, 84, 96, 96, 108, 108, 123, 123};
   
   ulong  m, shifts, adds;

   if(size >= 0x80000) return 127;   

   __asm__("bsrl %1,%0\n\t"
           : "=r" (m) 
           : "g"  (size));

   shifts = shift[m];
   adds   = add[m];
   
   return ((size >> shifts) + adds);
}

/* this function is used by Heap Server to allocate new chunk structs that it */
/* uses to store free chunks info in bins */
mchunkptr getNewChunks()
{
   int i;
   chunks = (mchunkptr)mmap(0, (sizeof(struct Chunk) * NUM_CHUNKS),
             PROT_READ|PROT_WRITE, MAP_PRIVATE, dev_zero_fd, 0);
   for(i=1; i<(NUM_CHUNKS-1); i++) chunks[i].ptr = &chunks[i+1];
   chunks[NUM_CHUNKS-1].ptr = 0;
   bins[0].ptr = &chunks[1];  
   newChunkReq++;
   return &chunks[0];
}

/* this function is used by Heap Server to add a deallocated chunk to the */ 
/* appropriate bin */
void add_to_bin(ulong address, ulong size)
{
   ulong     idx;
   mchunkptr temp;
   mchunkptr p;
   
   p = bins[0].ptr;
   if(p == 0) p = getNewChunks(); 
   else bins[0].ptr = p->ptr;
   
   p->address = address;
      
   if(in_smallbin_range(size)) 
   {
      idx           = smallbin_index(size);
      p->ptr        = bins[idx].ptr;
      bins[idx].ptr = p;
   }
   else 
   {
      idx  = largebin_index(size);
      temp = &bins[idx];
      
      while(temp->ptr != 0 && size > chunkSize(temp->ptr->address)) 
	 temp = temp->ptr;
      
      p->ptr    = temp->ptr;
      temp->ptr = p;
   }
   
   bins[idx].address++;
}

/* this function is used by Heap Server to locate and remove a chunk from */
/* the appropriate bin */
void remove_from_bin(ulong address, ulong size)
{
   ulong     idx, i;
   mchunkptr p, temp;
   
   if(in_smallbin_range(size)) idx = smallbin_index(size);
   else                        idx = largebin_index(size);

   p = &bins[idx];
   while(p->ptr->address != address) p = p->ptr;
   
   unlink(p, temp);
   
   bins[idx].address--; 
}


/****************************************************************************/ 
/********************* Application Side malloc Library **********************/
/****************************************************************************/

void HS_init_state(ulong);

/* macro for sending a request from the application to Heap Server */
#define doRequest(t, v, p) \
   Request.mtype = t; \
   Request.value = v; \
   Request.ptr   = p; \
   msgsnd(qid1, &Request, msgSize, 0);

/* this function is called on the first heap request to initialize everything */
void malloc_init_state()
{
   first_time = 1;
   
   ulong seed = 1;    /* change this for more randomness */
   srand(seed);
   
   /* generate the random message queues keys */
   key1 = rand() % 2147483647;
   key2 = rand() % 2147483647;
   
   /* set up the message queues */
   qid1 = msgget(key1, 0700 | IPC_CREAT); 
   qid2 = msgget(key2, 0700 | IPC_CREAT);   
   
   /* set up the different message sizes */
   msgSize            =  sizeof(void*)                      + sizeof(int);
   bulkDeallocMsgSize = (sizeof(void*) * BULK_DEALLOC_PTRS);
   preAllocMsgSize    = (sizeof(void*) * PRE_ALLOC_PTRS)    + sizeof(int);
   
   /* initialize stats */
   mallocTime   = callocTime = reallocTime    = freeTime = 0;
   mallocs      = callocs    = reallocs       = frees    = 0;
   bulkDeallocs = preAllocs  = max_foot_print = 0;
   
   /* initialize bulk deallocation and pre-allocation messages and structs */
   doPreAlloc = 0;
   
   bulkDeallocMsg.mtype = 9; 
   ptrsArrayPtr[0]      = 0;
   
   ulong i;
   for(i=1; i<64; i++) 
   { preAllocMsg[i].mtype = 0; ptrsArrayPtr[i] = PRE_ALLOC_PTRS; }
   
   /* set up the dual copy structures */
   #if LEVEL >= 4
   bulkDeallocBackup    = (void*)sbrk(sizeof(void*)*BULK_DEALLOC_PTRS);
   for(i=0; i<64; i++) 
      preAllocBackup[i] = (void*)sbrk(sizeof(void*)*PRE_ALLOC_PTRS);
   #endif
   
   /* find and align the heap start address */
   if((((ulong)(sbrk(0))) & ALIGN_MASK) == (ALIGNMENT/2)) sbrk((ALIGNMENT/2)); 
   ulong hStart = (ulong)(sbrk(0)); 
   if((hStart & ALIGN_MASK) != 0)
      errNexit("sbrk(0) not correctly aligned at start");
      
   /* fork Heap Server */
   pid = fork();
   if(pid == 0) { HS_init_state(hStart); exit(0); }
   else if(pid < 0) errNexit("forking failed");
}

/* this function returns a pointer to a new chunk */
void* getNewChunk(size_t bytes, ulong type)
{
   /* add random pad to request size for obfuscation */
   if(bytes >= 512) bytes += (rand() % 65);
   
   /* align request size */
   ulong nb = req2size(bytes);
   
   /* use a pre-allocated chunk and possibly request new pre-allocated chunks */
   #if LEVEL >= 3
   if(doPreAlloc == 1 && nb < 512)
   {
      void* mem;
      ulong idx = nb >> 3;
      
      /* out of pre-allocated chunks, request new chunks from Heap Server */
      if(ptrsArrayPtr[idx] == PRE_ALLOC_PTRS)
      {
         doRequest(7, nb, 0);
	 ptrsArrayPtr[idx] = 0;
	 preAllocs++;
	 while(msgrcv(qid2, &preAllocMsg[idx], preAllocMsgSize, 
	               0, IPC_NOWAIT) == -1) ;
	 if(preAllocMsg[idx].value != 0) do_sbrk(preAllocMsg[idx].value);
	 
	 /* save dual copy */
	 #if LEVEL >= 4
	 ulong i;
	 for(i=0; i<PRE_ALLOC_PTRS; i++)
	    preAllocBackup[idx][i] = preAllocMsg[idx].ptrs[i];
	 #endif
      }
      
      /* use a pre-allocated chunk */
      mem = preAllocMsg[idx].ptrs[ptrsArrayPtr[idx]];
      
      /* compare to the dual copy */
      #if LEVEL >= 4
      if(mem != preAllocBackup[idx][ptrsArrayPtr[idx]])
         errNexit("mismatched pre-allocated pointers");
      #endif
      
      ptrsArrayPtr[idx]++;
      
      return mem;
   }
   
   if((mallocs+callocs) >= PRE_ALLOC_THRESH) doPreAlloc = 1;   
   #endif
   
   /* perform a normal malloc request/reply */
   doRequest(type, nb, 0);
   while(msgrcv(qid2, &Reply, msgSize, 0, IPC_NOWAIT) == -1) ;
   
   if(Reply.value != 0) do_sbrk(Reply.value);
   
   return Reply.ptr;
}

/* malloc() */
void* malloc(size_t bytes)
{
   if(first_time == 0) malloc_init_state();   
   
   mallocs++;
   
   #ifdef MALLOC_TIMING
   rdtscll(tt1);
   #endif
   
   void* mem = getNewChunk(bytes, 1);
         
   #ifdef MALLOC_TIMING
   rdtscll(tt2);
   mallocTime += (tt2-tt1);
   #endif
      
   return mem;
}

/* calloc() */
void* calloc(size_t n_elements, size_t elem_size)
{
   if(first_time == 0) malloc_init_state();  
   
   callocs++;
   
   #ifdef MALLOC_TIMING
   rdtscll(tt1);
   #endif
   
   ulong bytes = n_elements * elem_size;
   
   void* mem = getNewChunk(bytes, 2);
   
   if(mem != 0) memset((ulong*)(mem), 0, bytes);
      
   #ifdef MALLOC_TIMING
   rdtscll(tt2);
   callocTime += (tt2-tt1);
   #endif
      
   return mem;
}

/* realloc() */
void* realloc(void* oldmem, size_t bytes)
{
   if(first_time == 0) malloc_init_state();  
      
   if(oldmem == 0) return malloc(bytes);

   reallocs++;
   
   #ifdef MALLOC_TIMING
   rdtscll(tt1);
   #endif
   
   /* add random padding for obfuscation */
   if(bytes >= 512) bytes += (rand() % 65);
   
   /* align request size */
   ulong nb = req2size(bytes);   
   
   doRequest(3, nb, oldmem); 
   while(msgrcv(qid2, &Reply, msgSize, 0, IPC_NOWAIT) == -1) ; 
   
   if(Reply.value != 0) do_sbrk(Reply.value);
   
   if(oldmem != Reply.ptr && Reply.ptr != 0) 
      memcpy((ulong*)(Reply.ptr), (ulong*)(oldmem), bytes);
   
   #ifdef MALLOC_TIMING
   rdtscll(tt2);
   reallocTime += (tt2-tt1);   
   #endif
   
   return Reply.ptr;
}

/* free() */
void free(void* mem)
{
   if(first_time == 0) malloc_init_state();   
   
   frees++;
   
   #ifdef MALLOC_TIMING
   rdtscll(tt1);
   #endif
   
   if(mem == 0) return;
   
   #if LEVEL >= 2
   if(frees <= BULK_DEALLOC_THRESH) { doRequest(4, 0, mem); }
   else   /* store in the bulk deallocation structure */
   {
      /* store in dual copy structure */
      #if LEVEL >= 4
      bulkDeallocBackup[ptrsArrayPtr[0]] = mem;
      #endif
      
      bulkDeallocMsg.ptrs[ptrsArrayPtr[0]++] = mem;
      
      /* send bulk deallocation reuqest */
      if(ptrsArrayPtr[0] == BULK_DEALLOC_PTRS)
      {
         /* compare to dual copy */
	 #if LEVEL >= 4
         ulong i;
	 for(i=0; i<BULK_DEALLOC_PTRS; i++)
	    if(bulkDeallocBackup[i] != bulkDeallocMsg.ptrs[i])
	       errNexit("mismatched bulk deallocated pointers");
	 #endif
	 
	 doRequest(6, 0, 0);
         msgsnd(qid1, &bulkDeallocMsg, bulkDeallocMsgSize, 0);
         ptrsArrayPtr[0] = 0;
         bulkDeallocs++;
      }
   }
   
   #else
   { doRequest(4, 0, mem); }
   
   /* wait for acknowledgement */
   #if LEVEL == 0
   while(msgrcv(qid2, &Reply, msgSize, 0, IPC_NOWAIT) == -1) ;
   #endif
   
   #endif
   
   #ifdef MALLOC_TIMING
   rdtscll(tt2);
   freeTime += (tt2-tt1);
   #endif
}
void cfree(void *mem) { free(mem); return; }

/* malloc_stats() */
void malloc_stats()
{
   if(first_time == 0)  malloc_init_state();
  
   fprintf(stdout, "--------- malloc library stats (all in Bytes) ---------\n");
   fprintf(stdout, "App_Stats: mallocs/callocs/reallocs/frees/bulkDeallocs/preAllocs =\nAPP_STATS:\t%u\t%u\t%u\t%u\t%u\t%u\n",
                    mallocs, callocs, reallocs, frees, bulkDeallocs, preAllocs);
   fprintf(stdout, "App_Cycles: mallocs/callocs/reallocs/frees =\nAPP_CYCLES:\t%llu\t%llu\t%llu\t%llu\n",
                    mallocTime, callocTime, reallocTime, freeTime);
   
   doRequest(5, 0, 0);  
   while(msgrcv(qid2, &Reply, msgSize, 0, IPC_NOWAIT) == -1) ; 
   
   /* remove message queues */
   msgctl(qid1, IPC_RMID, 0);
   msgctl(qid2, IPC_RMID, 0);
   
   fprintf(stdout, "-------------------------------------------------------\n");

   /* wait till Heap Server exits */
   int *status;
   while(wait(&status) != pid) ;
   return;
}


/****************************************************************************/
/***************************** Heap Server **********************************/
/****************************************************************************/

void  HS_malloc(ulong);
void  HS_preAlloc(ulong);
void  HS_free(void*);
void  HS_realloc(void*, ulong);
void  HS_stats();

/* this macro sends a reply from the Heap Server to the application */
#define doReply(p)        \
   Reply.mtype = 2;       \
   Reply.value = to_sbrk; \
   Reply.ptr   = p;       \
   msgsnd(qid2, &Reply, msgSize, 0); \
   to_sbrk = 0;

/* main Heap Server loop */
void heapServer()
{
   while(1)
   {
      /* block on the request queue waiting for a request */
      #ifdef MALLOC_TIMING
      rdtscll(tt1);
      #endif
      msgrcv(qid1, &Request, msgSize, 0, 0);
      #ifdef MALLOC_TIMING
      rdtscll(tt2);
      sleepTime += (tt2-tt1);
      #endif
         
      switch(Request.mtype)
      {
	 case 1:   mallocs++;                   /* malloc() */
                   #ifdef MALLOC_TIMING
		   rdtscll(tt1);
	 	   #endif
		   HS_malloc(Request.value);
		   #ifdef MALLOC_TIMING
		   rdtscll(tt2);
                   mallocTime += (tt2-tt1);
                   #endif
		   break;

	 case 2:   callocs++;                   /* calloc() */
	           #ifdef MALLOC_TIMING
		   rdtscll(tt1);
	 	   #endif
		   HS_malloc(Request.value);
		   #ifdef MALLOC_TIMING
		   rdtscll(tt2);
                   callocTime += (tt2-tt1);
                   #endif
		   break;
		
	 case 3:   reallocs++;                  /* realloc() */
	           #ifdef MALLOC_TIMING
		   rdtscll(tt1);
		   #endif
		   HS_realloc(Request.ptr, Request.value);
		   #ifdef MALLOC_TIMING
		   rdtscll(tt2);
                   reallocTime += (tt2-tt1);
                   #endif
		   break;
      
         case 4:   frees++;                     /* free() */
	           #ifdef MALLOC_TIMING
		   rdtscll(tt1);
		   #endif
		   HS_free(Request.ptr);
		   #if LEVEL == 0
		   Reply.mtype = 1;
                   Reply.value = 0;
                   Reply.ptr   = 0;
                   msgsnd(qid2, &Reply, msgSize, 0);
		   #endif
		   #ifdef MALLOC_TIMING
		   rdtscll(tt2);
                   freeTime += (tt2-tt1);
                   #endif
		   break;
	
         case 5:   HS_stats();                  /* malloc_stats() */
	           exit(0);
                   break;

#if LEVEL >= 2                                  /* bulk deallocation */
	 case 6:   bulkDeallocs++;
	           #ifdef MALLOC_TIMING
		   rdtscll(tt1);
	           #endif
		   msgrcv(qid1, &bulkDeallocMsg, bulkDeallocMsgSize, 0, 0);
	           ulong i;
		   for(i=0; i<BULK_DEALLOC_PTRS; i++) 
		      HS_free(bulkDeallocMsg.ptrs[i]);
		   #ifdef MALLOC_TIMING
		   rdtscll(tt2);
                   bulkDeallocTime += (tt2-tt1);
                   #endif
		   break;
#endif

#if LEVEL >= 3                                  /* pre-allocation */     
         case 7:   preAllocs++;
	           #ifdef MALLOC_TIMING
		   rdtscll(tt1);
		   #endif
		   ulong idx = Request.value >> 3;
		   if(preAllocMsg[idx].mtype == 0)
		   {
		      HS_preAlloc(Request.value);
		      preAllocMsg[idx].mtype = 10;
		   }
		   preAllocMsg[idx].value = to_sbrk;
		   msgsnd(qid2, &preAllocMsg[idx], preAllocMsgSize, 0);
                   to_sbrk = 0;
		   HS_preAlloc(Request.value);
		   #ifdef MALLOC_TIMING
		   rdtscll(tt2);
                   preAllocTime += (tt2-tt1);
		   #endif
		   break;
#endif
	 
	 default:  errNexit("unknown request type");
      }
   }
}


/* this function initializes Heap Server */
/* this is the entry function when Heap Server is forked */
void HS_init_state(ulong hStart)
{
   fprintf(stdout,
   #if   LEVEL == 0
      "\n***** Mazen: Welcome to my FULLY-OPTIMIZED (Blocking) HEAP-SERVER library *****\n");
   #elif LEVEL == 1
      "\n***** Mazen: Welcome to my FULLY-OPTIMIZED (nonBlocking) HEAP-SERVER library *****\n");
   #elif LEVEL == 2
      "\n***** Mazen: Welcome to my FULLY-OPTIMIZED (nonBlocking + bulkDeallocation) HEAP-SERVER library *****\n");
   #elif LEVEL == 3
      "\n***** Mazen: Welcome to my FULLY-OPTIMIZED (nonBlocking + bulkDeallocation + preAllocation) HEAP-SERVER library *****\n");
   #elif LEVEL == 4
      "\n***** Mazen: Welcome to my FULLY-OPTIMIZED (nonBlocking + bulkDeallocation + preAllocation + dual-copies) HEAP-SERVER library *****\n");
   #endif
   
   #if LEVEL >= 2
   fprintf(stdout, "BULK_DEALLOC_PTRS = %u\n", BULK_DEALLOC_PTRS); 
   #endif
   
   #if LEVEL >= 3
   fprintf(stdout, "PRE_ALLOC_PTRS = %u\n", PRE_ALLOC_PTRS); 
   #endif
   
   /* create/initialize structures and stats */
   
   ulong i;
   
   for(i=0; i<NBINS; i++) { bins[i].address = 0; bins[i].ptr = 0; }
   
   for(i=0; i<64; i++)    { preAllocMsg[i].mtype = 0; }

   pagesize = getpagesize();
   pagemask = pagesize - 1;

   dev_zero_fd = open("/dev/zero", O_RDWR);
   
   memMap = (struct memMapWord*)sbrk(pagesize);
   if(memMap == (struct memMapWord*)(-1)) errNexit("memMap allocation failed");
   bitMapSize = pagesize;

   mallocTime      = callocTime   = reallocTime = freeTime = 0;
   mallocs         = callocs      = reallocs    = frees    = 0;
   bulkDeallocTime = preAllocTime = sleepTime   = 0;
   bulkDeallocs    = preAllocs    = 0;
   max_foot_print  = newChunkReq  = to_sbrk     = 0;
   
   heapStart = hStart;
   top       = hStart;
   topSize   = 0;
   
   heapServer();
}


/* this function returns a pointer to a new chunk */
void HS_malloc(ulong nb)
{
   ulong         idx;              /* associated bin index */
   mbinptr       bin;              /* associated bin */
   mchunkptr     victim, temp;     /* inspected/selected chunk */
   ulong         size;             /* its size */
   ulong         p;                /* address of selected chunk */
   ulong         remainder;        /* address of remainder from a split */
   ulong         remainder_size;   /* its size */
   ulong         rnd, i;           /* for obfuscation */
   
   /* uses a best-fit LIFO policy in re-allocating chunks */
   
   /* check smallbins first if possible */
   if(in_smallbin_range(nb)) 
   {
      idx    = smallbin_index(nb);
      victim = &bins[idx];

      if(victim->ptr != 0 && victim->address >= MIN_CHUNKS_TO_USE_BIN) 
      {
	 /* choose a random chunk for layout obfuscation */
	 if(victim->address > MAX_RAND_OBF) rnd = MAX_RAND_OBF;
	 else                               rnd = victim->address;
	 rnd = rand() % rnd;
	 for(i=0; i<rnd; i++) victim = victim->ptr;
	 
	 p = victim->ptr->address;
	 #if LEVEL >= 1
	 doReply(chunk2mem(p));
	 #endif
	 setInUse(p, nb);
         unlink(victim, temp);
	 bins[idx].address--;
	 #if LEVEL == 0
	 doReply(chunk2mem(p));
	 #endif
         return;
      }
      idx++;
   }
   else idx = largebin_index(nb);

   /* traverse and search all remaining bins */
   for(idx; idx < NBINS; idx++)
   {
      bin = &bins[idx];
      for(victim = bin; victim->ptr != 0; victim = victim->ptr) 
      {
         p    = victim->ptr->address;
	 size = chunkSize(p);
      
         if(size >= nb) 
	 {
            #if LEVEL >= 1
	    doReply(chunk2mem(p));
            #endif
	    
	    remainder_size = size - nb;
            unlink(victim, temp);
	    bins[idx].address--;
            
	    if(remainder_size < MINSIZE) setInUse(p, size);
            else 
	    {
               remainder = p + nb;
	       writeSize(p, nb, 1);
	       writeSize(remainder, remainder_size, 0);
               add_to_bin(remainder, remainder_size);
	    }
            
	    #if LEVEL == 0
	    doReply(chunk2mem(p));
	    #endif
          
	    return;
         }
      }    
   }

   /* use top to satisfy the request */
   p  = top;

   /* not enough space in top */
   if(topSize < nb + MINSIZE)
   {
      size = (nb + MINSIZE - topSize + pagemask) & ~pagemask;
      if(size < 262144) size = 262144;

      to_sbrk += (int)size;
      
      #if LEVEL >= 1
      doReply(chunk2mem(p));
      #endif
      
      topSize += size;

      ulong foot_print = top + topSize - heapStart;
      if(foot_print > max_foot_print) max_foot_print = foot_print;

      ulong newSize = foot_print >> 5;
      if(newSize > bitMapSize)
      {
         newSize = (newSize + pagemask) & ~pagemask;
         sbrk(newSize - bitMapSize);
	 bitMapSize = newSize;
      }
   }
   #if LEVEL >= 1
   else { doReply(chunk2mem(p)); }
   #endif

   top      = p + nb;
   topSize -= nb;
   writeSize(p, nb, 1);
   
   #if LEVEL == 0
   doReply(chunk2mem(p));
   #endif
	 
   return;
}


/* this function fills the pre-allocation message with pre-allocated chunks */
#if LEVEL >= 3
void HS_preAlloc(ulong nb)
{
   ulong      idx, size, fromTop, p, i;
   mchunkptr  victim, temp;
   
   if(!in_smallbin_range(nb))
      errNexit("pre-allocation request of a large size");
   
   i = 0;
   
   idx    = smallbin_index(nb);
   victim = &bins[idx];

   /* try to fill from appropriate bin first */
   while(victim->ptr != 0 && i < PRE_ALLOC_PTRS) 
   {
      p = victim->ptr->address;
      setInUse(p, nb);
      unlink(victim, temp);
      preAllocMsg[idx].ptrs[i++] = chunk2mem(p);
   }
   
   bins[idx].address = bins[idx].address - i;
   
   if(i == PRE_ALLOC_PTRS) return;

   /* if request still not fully satisfied, allocate all remaining from top */
   
   fromTop = nb * (PRE_ALLOC_PTRS - i);

   if(topSize < fromTop + MINSIZE)
   {
      size = (fromTop + MINSIZE - topSize + pagemask) & ~pagemask;
      if(size < 262144) size = 262144;

      to_sbrk += (int)size;      
      topSize += size;

      ulong foot_print = top + topSize - heapStart;
      if(foot_print > max_foot_print) max_foot_print = foot_print;
      
      ulong newSize = foot_print >> 5;
      if(newSize > bitMapSize)
      {
         newSize = (newSize + pagemask) & ~pagemask;
         sbrk(newSize - bitMapSize);
	 bitMapSize = newSize;
      }
   }
   
   p  = top;   
   
   while(i < PRE_ALLOC_PTRS)
   {
      preAllocMsg[idx].ptrs[i++] = chunk2mem(p);
      writeSize(p, nb, 1);
      p += nb;
   }

   top      = p;
   topSize -= fromTop;

   return;
}
#endif


/* this function deallocates a chunk */
void HS_free(void* mem)
{
   ulong   p;             /* chunk corresponding to mem */
   ulong   size;          /* its size */
   ulong   nextchunk;     /* next contiguous chunk */
   ulong   nextsize;      /* its size */
   ulong   prevsize;      /* size of previous contiguous chunk */
   int     extra;         /* total bytes to trim from top */

   p    = mem2chunk(mem);
   size = chunkSize(p);

   nextchunk = p + size;
   
   prevsize = nextsize = 0;
   
   /* consolidate backward */
   if(size > MAX_FAST_SZ) prevsize = prevSize(p);
   if(prevsize > MAX_FAST_SZ)
   {
      p     = p - prevsize; 
      size += prevsize;
      remove_from_bin(p, prevsize);
   }

   if(nextchunk != top) 
   {
      if(size > MAX_FAST_SZ) nextsize = chunkSizeNused(nextchunk);
      
      /* consolidate forward */
      if(nextsize > MAX_FAST_SZ) 
      {
         remove_from_bin(nextchunk, nextsize); 
	 size += nextsize;
      }

      writeSize(p, size, 0);
      add_to_bin(p, size);
   }

   /* if the chunk borders the current top, consolidate into top */
   else 
   {
      top      = p;
      topSize += size;

      /* if the total unused topmost memory exceeds trim threshold, trim top. */
      if(topSize >= TOPTRIM && topSize > TOPSLACK)
      {
         extra   = topSize - TOPSLACK;
         to_sbrk = to_sbrk - extra;
	 topSize = TOPSLACK;
      }
   }
}


/* this function performs re-allocation */
void HS_realloc(void* oldmem, ulong nb)
{
   ulong oldsize; /* old chunk's size */

   oldsize = chunkSize((ulong)oldmem); 
   
   if(oldsize >= nb) 
   {
      doReply(oldmem);
      return;
   }

   HS_free(oldmem);
   HS_malloc(nb);
   
   return;
}


/* this function prints Heap Server's stats */
void HS_stats()
{
   fprintf(stdout, "HS_Stats: mallocs/callocs/reallocs/frees/bulkDeallocs/preAllocs/max_foot_print/current_foot_print/bitMapSize/mmappedChunks =\nHS_STATS:\t%u\t%u\t%u\t%u\t%u\t%u\t%u\t%u\t%u\t%u\n",
                    mallocs, callocs, reallocs, frees, bulkDeallocs, preAllocs, max_foot_print, (top+topSize-heapStart), bitMapSize, (newChunkReq*NUM_CHUNKS));
   fprintf(stdout, "HS_Cycles: mallocs/callocs/reallocs/frees/bulkDealloc/preAlloc/sleep =\nHS_CYCLES:\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\n",
                    mallocTime, callocTime, reallocTime, freeTime, bulkDeallocTime, preAllocTime, sleepTime);

   doReply(0);
   return;
}


/************************* Non-Implemented Functions ************************/

void notImplemented(char *name)
{ fprintf(stderr, "Mazen: %s not implemented\n", name); exit(0); }

void** independent_calloc(size_t n_elements, size_t elem_size, void* chunks[])
{ notImplemented("independent_calloc"); }
void** independent_comalloc(size_t n_elements, size_t sizes[], void* chunks[])
{ notImplemented("independent_comalloc"); }

void*  memalign(size_t align, size_t bytes) { notImplemented("memalign"); }
void*  valloc(size_t bytes)                 { notImplemented("valloc"); }
void*  pvalloc(size_t bytes)                { notImplemented("pvalloc"); }
int    malloc_trim(size_t pad)              { notImplemented("malloc_trim"); }
size_t malloc_usable_size(void* mem)        { notImplemented("malloc_size"); }
int    mallopt(int param_number, int value) { notImplemented("mallopt"); }

