<pre>
// mandelbrot program, by Derek Zahn
// SSE scalar code adapted from some code found on the net somewhere

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <windows.h>
#include "glut.h"

#define MAX_CORES 8

#define MIN_FLOATINC  0.0000001            // max zoom level for float
#define MIN_DOUBLEINC 0.0000000000000003   // max zoom level for double

int g_nWidth;          // screen size
int g_nHeight;

int g_nRenderWidth;    // size of our render
int g_nRenderHeight;

int g_bDouble;         // double or single precision

float g_fCenterX;      // float, x coordinate of center of screen
float g_fCenterY;      // float, y coordinate of center of screen
float g_fInc;          // float, increment per screen pixel
float g_fRenderInc;    // float, increment per render pixel

double g_dCenterX;     // float, x coordinate of center of screen
double g_dCenterY;     // double, y coordinate of center of screen
double g_dInc;         // double, increment per screen pixel
double g_dRenderInc;   // double, increment per render pixel

// this program uses a LOD (level of detail) to keep frame rates up
// while panning and zooming.  The size of the image rendered
// is screensize/g_nLOD

int g_nMaximumLOD = 2; // maximum LOD divisor
int g_nLOD;            // current LOD divisor

// computation is broken up into 16x16 grids, both to give bite-size
// units to the processors, and to cut down accumulated errors from
// the increments
double g_dGridX[256];  // grid coordinates
double g_dGridY[256];
int g_nGridWidth;      // number of grids
int g_nGridHeight;

#define ITERATIONS 256             // number of iterations before quitting

HANDLE g_hCoreThread[MAX_CORES];   // worker thread handles
HANDLE g_hWorkToDo[MAX_CORES];     // mutex for work in progress
HANDLE g_hWorkDone[MAX_CORES];     // mutex for work finished
BOOL g_bExit;                      // thread exit flag
int g_nCores;                      // number of cores -> number of worker threads
BYTE *g_pucCounts;                 // result from computations: iteration counts
BYTE *g_pucCountsInProgress;       // results while the threads are working
BYTE *g_pucPixels;                 // RGBA render buffer

// mouse stuff
int g_nMouseX;
int g_nMouseY;
int g_bLeftDown = 0;
int g_bRightDown = 0;

// compute the grid coordinates for the frame

void ComputeGrid()
{
  int n;
  double dGridStart;

  g_nGridWidth = (g_nRenderWidth + 15) / 16;
  dGridStart = g_dCenterX - (g_nRenderWidth / 2) * g_dRenderInc;
  for(n = 0; n < g_nGridWidth; n++)
    g_dGridX[n] = dGridStart + n * 16 * g_dRenderInc;
  g_nGridHeight = (g_nRenderHeight + 15) / 16;
  dGridStart = g_dCenterY - (g_nRenderHeight / 2) * g_dRenderInc;
  for(n = 0; n < g_nGridHeight; n++)
    g_dGridY[n] = dGridStart + n * 16 * g_dRenderInc;
}

// fly the point of view around depending on what the mouse is doing

void UpdatePosition()
{
  int nDiff;

  g_nLOD = 1;
  if(g_bLeftDown || g_bRightDown)
  {
    if(g_bLeftDown && !g_bRightDown)
    {
      g_dInc = g_dInc * 0.96;
      if(!g_bDouble && (g_dInc < MIN_FLOATINC))
        g_dInc = MIN_FLOATINC;
      else if(g_bDouble && (g_dInc < MIN_DOUBLEINC))
        g_dInc = MIN_DOUBLEINC;
      g_fInc = (float) g_dInc;
      g_nLOD = g_nMaximumLOD;
    }
    else if(!g_bLeftDown && g_bRightDown)
    {
      g_dInc = g_dInc * 1.04;
      if(!g_bDouble && (g_dInc < MIN_FLOATINC))
        g_dInc = MIN_FLOATINC;
      else if(g_bDouble && (g_dInc < MIN_DOUBLEINC))
        g_dInc = MIN_DOUBLEINC;
      g_fInc = (float) g_dInc;
      g_nLOD = g_nMaximumLOD;
    }
    nDiff = g_nMouseX - g_nWidth/2;
    if(nDiff > 10)
    {
      nDiff = (nDiff - 10) / 10;
      g_dCenterX += ((double) nDiff) * g_dInc;
      g_nLOD = g_nMaximumLOD;
    }
    else if(nDiff < -10)
    {
      nDiff = (-nDiff - 10) / 10;
      g_dCenterX -= ((double) nDiff) * g_dInc;
      g_nLOD = g_nMaximumLOD;
    }
    nDiff = g_nMouseY - g_nHeight/2;
    if(nDiff > 10)
    {
      nDiff = (nDiff - 10) / 10;
      g_dCenterY -= ((double) nDiff) * g_dInc;
      g_nLOD = g_nMaximumLOD;
    }
    else if(nDiff < -10)
    {
      nDiff = (-nDiff - 10) / 10;
      g_dCenterY += ((double) nDiff) * g_dInc;
      g_nLOD = g_nMaximumLOD;
    }
  }

  g_nRenderWidth = g_nWidth / g_nLOD;
  g_nRenderHeight = g_nHeight / g_nLOD;
  g_dRenderInc = g_dInc * ((double) g_nLOD);
  g_fRenderInc = (float) g_dRenderInc;
  g_fCenterX = (float) g_dCenterX;
  g_fCenterY = (float) g_dCenterY;
}

//////////////////////////////////////
//
// Translation of iteration counts to colors
//
//////////////////////////////////////

__inline int XX(int n)
{
  n = n & 0x1FF;
  if(n >= 256)
    n = 512 - n;
  return n;
}

__inline void SetPixVal(int nX, int nY, int nR, int nG, int nB)
{
  BYTE *pOff;
  pOff = &g_pucPixels[(nY*g_nRenderWidth+nX) * 4];
  pOff[0] = (BYTE) nR;
  pOff[1] = (BYTE) nG;
  pOff[2] = (BYTE) nB;
  pOff[3] = 0xFF;
}

__inline void SetPix(int nX, int nY, int nIter)
{
  SetPixVal(nX, nY, XX(nIter*5), XX(nIter*7), XX(nIter*11));
}

//////////////////////////////////////
//
// Stupid but readable versions of the code, will run on anything but are slow
//
//////////////////////////////////////

int CountIterationsFloat(float fInitX, float fInitY)
{
    
    float fCx, fCy, fXSquared, fYSquared, fCxCy;
    int nIteration;
    
    fCx = fInitX + fInitX * fInitX - fInitY * fInitY;
    fCy = fInitY + fInitX * fInitY + fInitX * fInitY;
    
    for(nIteration=0; nIteration < ITERATIONS; nIteration++)
    {
      fYSquared = fCy * fCy;
      fXSquared = fCx * fCx;
      if(fXSquared + fYSquared >= 4.0f)
        break;
      fCxCy = fCx * fCy;
      fCy = fInitY + fCxCy + fCxCy;
      fCx = fInitX - fYSquared + fXSquared;
    }
    return nIteration;
}

int CountIterationsDouble(double fInitX, double fInitY)
{
    
    double fCx, fCy, fXSquared, fYSquared;
    int nIteration;
    
    fCx = fInitX + fInitX * fInitX - fInitY * fInitY;
    fCy = fInitY + fInitX * fInitY + fInitX * fInitY;
    
    for(nIteration=0; nIteration < ITERATIONS; nIteration++)
    {
      fYSquared = fCy * fCy;
      fXSquared = fCx * fCx;
      if(fXSquared + fYSquared >= 4.0f)
        break;
      fCy = fInitY + fCx * fCy + fCx * fCy;
      fCx = fInitX - fYSquared + fXSquared;
    }
    return nIteration;
}

//////////////////////////////////////
//
// SSE2 computational units for plain CPU computation
//
//////////////////////////////////////

float fFour[4] = { 4.0f, 4.0f, 4.0f, 4.0f };

void ComputeFloat(float *pfX, float *pfY, float *pfCounts)
{
  float *pf = fFour;

  __asm {
  ; xmm4 should contain 'startx'
  ; xmm5 should contain 'starty'

  ; xmm0 = zx
  ; xmm1 = zy

  ; xmm7 is initialized with four values of 4.0f

  mov eax, pfX
  movups xmm4, [eax]
  mov eax, pfY
  movups xmm5, [eax]
  mov eax, pf
  movups xmm7, [eax]

  MOVAPS xmm0,xmm4   ; xmm0 = zx
  XORPS  xmm6,xmm6   ; xmm6(iterations) = 0
  MOVAPS xmm1,xmm5   ; xmm1 = 
  mov ecx, ITERATIONS    ; ecx is max iteration count
ITERATIONLOOP:
  MOVAPS xmm2, xmm0  ; xmm2 = zx
  MULPS  xmm0, xmm0  ; xmm0 = zx * zx
  MOVAPS xmm3, xmm1  ; xmm3 = zy
  ADDPS  xmm1, xmm1  ; xmm1 = 2*zy
  MULPS  xmm1, xmm2  ; xmm1 = 2*zx*zy
  MOVAPS xmm2, xmm0  ; xmm2 = zx * zx
  MULPS  xmm3, xmm3  ; xmm3 = zy * zy
  ADDPS  xmm1, xmm5  ; xmm1 = 2*zx*zy + starty
  SUBPS  xmm0, xmm3  ; xmm0 = zx*zx-zy*zy
  ADDPS  xmm2, xmm3  ; xmm2 = zx*zx+zy*zy
  CMPLEPS xmm2, xmm7 ; xmm2 will be zero if xmm2's values were greater than 4.0
  ADDPS   xmm0, xmm4 ; xmm0 = zx*zx-zy*zy+startx
  MOVMSKPS eax, xmm2 ; moves 4 sign bits from xmm2 values into eax
  test eax, eax
  jz ALLDONE         ; all four have exited
  ANDPS   xmm2, xmm7 ; xmm2 gets 4.0 added for each one that has not finished
  ADDPS   xmm6, xmm2 ; increment running count
  sub ecx, 1
  jnz ITERATIONLOOP  ; quit if max reached
ALLDONE:
   mov eax, pfCounts
   movups [eax], xmm6
  }
}

double dFour[2] = { 4.0, 4.0 };

void ComputeD(double *pfX, double *pfY, double *pfCounts)
{
  double *pf = dFour;

  __asm {
  ; xmm4 should contain 'startx'
  ; xmm5 should contain 'starty'

  ; xmm0 = zx
  ; xmm1 = zy

  ; xmm7 is initialized with two values of 4.0

  mov eax, pfX
  movupd xmm4, [eax]
  mov eax, pfY
  movupd xmm5, [eax]
  mov eax, pf
  movupd xmm7, [eax]

  MOVAPD xmm0,xmm4   ; xmm0 = zx
  XORPD  xmm6,xmm6   ; xmm6(iterations) = 0
  MOVAPD xmm1,xmm5   ; xmm1 = 
  mov ecx, ITERATIONS    ; ecx is max iteration count
ITERATIONLOOP:
  MOVAPD xmm2, xmm0  ; xmm2 = zx
  MULPD  xmm0, xmm0  ; xmm0 = zx * zx
  MOVAPD xmm3, xmm1  ; xmm3 = zy
  ADDPD  xmm1, xmm1  ; xmm1 = 2*zy
  MULPD  xmm1, xmm2  ; xmm1 = 2*zx*zy
  MOVAPD xmm2, xmm0  ; xmm2 = zx * zx
  MULPD  xmm3, xmm3  ; xmm3 = zy * zy
  ADDPD  xmm1, xmm5  ; xmm1 = 2*zx*zy + starty
  SUBPD  xmm0, xmm3  ; xmm0 = zx*zx-zy*zy
  ADDPD  xmm2, xmm3  ; xmm2 = zx*zx+zy*zy
  CMPLEPD xmm2, xmm7 ; xmm2 will be zero if xmm2's values were greater than 4.0
  ADDPD   xmm0, xmm4 ; xmm0 = zx*zx-zy*zy+startx
  MOVMSKPD eax, xmm2 ; moves 4 sign bits from xmm2 values into eax
  test eax, eax
  jz ALLDONE         ; all four have exited
  ANDPD   xmm2, xmm7 ; xmm2 gets 4.0 added for each one that has not finished
  ADDPD   xmm6, xmm2 ; increment running count
  sub ecx, 1
  jnz ITERATIONLOOP  ; quit if max reached
ALLDONE:
   mov eax, pfCounts
   movupd [eax], xmm6
  }
}

void ComputeDouble(double *pdX, double *pdY, double *pdCounts)
{
  ComputeD(pdX, pdY, pdCounts);
  ComputeD(&pdX[2], &pdY[2], &pdCounts[2]);
}


void FillFloatGrid(int nGridX, int nGridY)
{
  int nX, nY;
  float fY;
  int nIteration;
  float fYs[4];
  float fXs[4];
  float fResult[4];
  BYTE *pucLine;

  for(nY = 0; nY < 16; nY++)
  {
    fY = (float) (g_dGridY[nGridY] + nY * g_dRenderInc);
    fYs[0] = fY;
    fYs[1] = fY;
    fYs[2] = fY;
    fYs[3] = fY;

    pucLine = g_pucCountsInProgress + g_nRenderWidth * (nY + nGridY * 16) + nGridX * 16;
    for(nX = 0; nX < 16; nX += 4)
    {
      fXs[0] = (float) (g_dGridX[nGridX] + nX * g_dRenderInc);
      fXs[1] = (float) (g_dGridX[nGridX] + (nX+1) * g_dRenderInc);
      fXs[2] = (float) (g_dGridX[nGridX] + (nX+2) * g_dRenderInc);
      fXs[3] = (float) (g_dGridX[nGridX] + (nX+3) * g_dRenderInc);
      ComputeFloat(fXs, fYs, fResult);
      nIteration = ((int) fResult[0]) >> 2;
      if(nIteration == ITERATIONS) nIteration = 0;
      *pucLine++ = (BYTE) nIteration;
      nIteration = ((int) fResult[1]) >> 2;
      if(nIteration == ITERATIONS) nIteration = 0;
      *pucLine++ = (BYTE) nIteration;
      nIteration = ((int) fResult[2]) >> 2;
      if(nIteration == ITERATIONS) nIteration = 0;
      *pucLine++ = (BYTE) nIteration;
      nIteration = ((int) fResult[3]) >> 2;
      if(nIteration == ITERATIONS) nIteration = 0;
      *pucLine++ = (BYTE) nIteration;
    }
  }
}

void FillFloat(int nCore)
{
  int nX, nY;
  int nSlot = 0;
  for(nY = 0; nY < g_nGridHeight; nY++)
  {
    for(nX = 0; nX < g_nGridWidth; nX++)
    {
      if(nSlot % g_nCores == nCore)
        FillFloatGrid(nX, nY);
      nSlot++;
    }
  }
}

void FillDoubleGrid(int nGridX, int nGridY)
{
  int nX, nY;
  double fY;
  int nIter;
  double fYs[4];
  double fXs[4];
  double fResult[4];
  BYTE *pucLine;

  for(nY = 0; nY < 16; nY++)
  {
    fY = g_dGridY[nGridY] + nY * g_dRenderInc;
    fYs[0] = fY;
    fYs[1] = fY;
    fYs[2] = fY;
    fYs[3] = fY;

    pucLine = g_pucCountsInProgress + g_nRenderWidth * (nY + nGridY * 16) + nGridX * 16;
    for(nX = 0; nX < 16; nX += 4)
    {
      fXs[0] = g_dGridX[nGridX] + nX * g_dRenderInc;
      fXs[1] = g_dGridX[nGridX] + (nX+1) * g_dRenderInc;
      fXs[2] = g_dGridX[nGridX] + (nX+2) * g_dRenderInc;
      fXs[3] = g_dGridX[nGridX] + (nX+3) * g_dRenderInc;
      ComputeDouble(fXs, fYs, fResult);
      nIter = ((int) fResult[0]) >> 2;
      if(nIter == ITERATIONS) nIter = 0;
      *pucLine++ = (BYTE) nIter;
      nIter = ((int) fResult[1]) >> 2;
      if(nIter == ITERATIONS) nIter = 0;
      *pucLine++ = (BYTE) nIter;
      nIter = ((int) fResult[2]) >> 2;
      if(nIter == ITERATIONS) nIter = 0;
      *pucLine++ = (BYTE) nIter;
      nIter = ((int) fResult[3]) >> 2;
      if(nIter == ITERATIONS) nIter = 0;
      *pucLine++ = (BYTE) nIter;
    }
  }
}

void FillDouble(int nCore)
{
  int nX, nY;
  int nSlot = 0;
  for(nY = 0; nY < g_nGridHeight; nY++)
  {
    for(nX = 0; nX < g_nGridWidth; nX++)
    {
      if(nSlot % g_nCores == nCore)
        FillDoubleGrid(nX, nY);
      nSlot++;
    }
  }
}

//////////////////////////////////////
//
// Computation and threading functions
//
//////////////////////////////////////

// The computation thread.  There is one of these per core for
// smokin' multiprocessor action

DWORD WINAPI ComputationThread(LPVOID lpParameter)
{
  int nCore = (int) lpParameter;

  while(!g_bExit)
  {
    WaitForSingleObject(g_hWorkToDo[nCore], INFINITE);
    if(g_bExit)
      break;
    // compute!

    if(g_bDouble)
      FillDouble(nCore);
    else
      FillFloat(nCore);

    SetEvent(g_hWorkDone[nCore]);
  }
  return 0;
}

void InitComputation()
{
  DWORD dwProcAffinity = 0;
  DWORD dwSystemAffinity = 0;
  HANDLE hProc;
  int n;

  // count the number of cores
  g_nCores = 1;
  hProc = OpenProcess(PROCESS_ALL_ACCESS, FALSE, GetCurrentProcessId());
  GetProcessAffinityMask(hProc, &dwProcAffinity, &dwSystemAffinity);
  CloseHandle(hProc);
  for(n = 0; n < MAX_CORES; n++)
  {
    if(dwProcAffinity & (1 << n))
      g_nCores = n + 1;
  }

  // create the data
  g_pucCounts = (BYTE *) malloc((g_nWidth+16) * (g_nHeight + 16));
  g_pucCountsInProgress = (BYTE *) malloc((g_nWidth+16) * (g_nHeight+16));
  g_pucPixels = (BYTE *) malloc((g_nWidth+16) * (g_nHeight+16) * 4);

  // create synchronization objects and the threads that use them
  g_bExit = FALSE;
  for(n = 0; n < g_nCores; n++)
  {
    g_hWorkToDo[n] = CreateEvent(NULL, FALSE, FALSE, NULL);
    g_hWorkDone[n] = CreateEvent(NULL, FALSE, FALSE, NULL);
    g_hCoreThread[n] = CreateThread(NULL, 0, ComputationThread, (LPVOID) n, 0, NULL);
    SetThreadIdealProcessor(g_hCoreThread[n], (DWORD) n); // hint to system about what we're up to
  }
}

void CleanupComputation()
{
  int n;

  g_bExit = TRUE;     // tell the threads to exit
  for(n = 0; n < g_nCores; n++)
    SetEvent(g_hWorkToDo[n]); // nudge the threads so they will see the exit flag
  // wait for all threads to exit, and clean up
  for(n = 0; n < g_nCores; n++)
  {
    WaitForSingleObject(g_hCoreThread[n], INFINITE);
    CloseHandle(g_hCoreThread[n]);
    CloseHandle(g_hWorkDone[n]);
    CloseHandle(g_hWorkToDo[n]);
  }
  
  // cleanup the data
  free(g_pucCounts);
  free(g_pucCountsInProgress);
  free(g_pucPixels);
}

void StartComputation()
{
  int n;
  ComputeGrid();
  // let loose the dogs!
  for(n = 0; n < g_nCores; n++)
    SetEvent(g_hWorkToDo[n]);
}

void WaitComputation()
{
  int n;
  BYTE *pucTemp;
  // wait for the dogs to finish barking
  for(n = 0; n < g_nCores; n++)
    WaitForSingleObject(g_hWorkDone[n], INFINITE);
  // swap the buffers so we can process them at leisure
  pucTemp = g_pucCounts;
  g_pucCounts = g_pucCountsInProgress;
  g_pucCountsInProgress = pucTemp;
}

//////////////////////////////////////
//
// Glut-based UI Functions, including display callback
//
//////////////////////////////////////
//

DWORD g_dwCycle = 0;

void Display(void)
{
  BYTE *pucIter;
  int nX, nY;

  UpdatePosition();   // update the center coordinate, zoom, etc
  StartComputation(); // let loose the machinery

  WaitComputation(); // wait for the current frame to finish computing
  g_dwCycle = GetTickCount();

  // convert the iteration counts to frame buffer color values
  pucIter = g_pucCounts;
  for(nY = 0; nY < g_nRenderHeight; nY++)
  {
    for(nX = 0; nX < g_nRenderWidth; nX++)
    {
      SetPix(nX, nY, *pucIter);
      pucIter++;
    }
  }

  // GL grunk
  glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
  // set the pixel data as a texture; we'll use it
  // to draw a textured quad covering the whole screen later
  glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, g_nRenderWidth, g_nRenderHeight, 0,
      GL_RGBA, GL_UNSIGNED_BYTE, g_pucPixels);
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
  glTexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);
  glTexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_WRAP_S, GL_CLAMP);
  glTexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_WRAP_T, GL_CLAMP);

  // the video card is good at this sort of stuff, so let it
  // stretch the image to the screen, in case we're in
  // a reduced-LOD mode
  glEnable(GL_TEXTURE_2D);
  glBegin(GL_POLYGON);
  glTexCoord2f(0.0f, 0.0f); glVertex2f(-1.0f, -1.0f);
  glTexCoord2f(1.0f, 0.0f); glVertex2f(1.0f, -1.0f);
  glTexCoord2f(1.0f, 1.0f); glVertex2f(1.0f, 1.0f);
  glTexCoord2f(0.0f, 1.0f); glVertex2f(-1.0f, 1.0f);
  glEnd();
  glFlush();

  glutSwapBuffers();
  glutPostRedisplay();
}

void Keyboard(unsigned char key, int nX, int nY)
{
  if(key == 27) // if user hits escape, he has become bored
  {
    CleanupComputation();
    exit(0);
  }
}

void Mouse(int nButton, int nState, int nX, int nY)
{
  if(nButton == 0)
  {
    if(nState == 0)
      g_bLeftDown = 1;
    else
      g_bLeftDown = 0;
  }
  else if(nButton == 2)
  {
    if(nState == 0)
      g_bRightDown = 1;
    else
      g_bRightDown = 0;
  }
  g_nMouseX = nX;
  g_nMouseY = nY;
}

void Motion(int nX, int nY)
{
  g_nMouseX = nX;
  g_nMouseY = nY;
}

//////////////////////////////////////
//
// Main program
//
//////////////////////////////////////

int main(int argc, char *argv[])
{
  char szModeString[50];

  glutInit(&argc, argv);
  glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE);

  g_nWidth = GetSystemMetrics(SM_CXSCREEN);
  g_nHeight = GetSystemMetrics(SM_CYSCREEN);

  sprintf_s(szModeString, sizeof(szModeString), "%dx%d:%d", g_nWidth, g_nHeight, 32);
  glutGameModeString(szModeString);

  glutEnterGameMode();

  g_nWidth = glutGameModeGet(GLUT_GAME_MODE_WIDTH);
  g_nHeight = glutGameModeGet(GLUT_GAME_MODE_HEIGHT);

  g_nLOD = 1;  // render the first frame full-resolution

  InitComputation();

  g_nRenderWidth = g_nWidth / g_nLOD;
  g_nRenderHeight = g_nHeight / g_nLOD;

  // set up the starting view position
  g_dCenterX = -0.6;
  g_fCenterX = (float) g_dCenterX;
  g_dCenterY = 0.0;
  g_fCenterY = (float) g_dCenterY;
  g_dInc = 4.0 / ((double) g_nRenderHeight);
  g_dRenderInc = g_dInc * ((double) g_nLOD);
  g_fInc = (float) g_dInc;
  g_fRenderInc = (float) g_dRenderInc;

  g_bDouble = 1;  // double precision.  for twice as fast single precision, set it to 0

  glutDisplayFunc(Display);
  glutKeyboardFunc(Keyboard);
  glutMouseFunc(Mouse);
  glutMotionFunc(Motion);

  glutMainLoop(); // go!

  CleanupComputation();
  return 0;
}
</pre>

