Coding Challenge I

Pages: 1 2 3 4 5 6 7 8 9

Solution from RU

The “8vecpi.c” program was compiled using: Intel(R) C++ Compiler for 32-bit applications, Version 6.0 Build 020321Z Copyright (C) 1985-2002 Intel Corporation. All rights reserved. The flags used to generate “8vecpi.exe” are: /Ox /Ot /Oy /QIfist /G6 /QaxMKW /GA /Qpc32 /Qvec_report3 /Qopt_report

The most significant speed improvement is from /QIfist which provides fast float->int conversions. Unfortunately I could not get the compiler to use cvttps2pi, so without /QIfist the conversions consume the majority of execution time.

This code was used as a stepping stone of sorts to get to the Assembler version. The code has many changes other than the unrolling:

  • – unsigned integers used in place of signed as Athlon executes 10% faster
  • – ‘q’ used instead of ‘a’ as storage array (loop interchange)
  • – fwrite was used instead of fputs (prevents the NEED for null-term)
  • – memcpy and memset used in place of for loops
  • – function inlined since it was guaranteed to be called, and only once
  • – ‘nines’ initialized to 1
  • – mallocs combined, and q[] guaranteed allocated to 256B boundary
  • – single-precision floating point used as FP math is faster than INT
  • – x87 control word set to truncate (to allow /QIfist)
  • – div/mod combos replaced with div and sub/mul combo since compiler seemed to like using two DIVs for the div/mod combo.
  • – lots of multiplies replaced with shifts and adds

In order to accomplish getting vectors some initialization and cleanup was needed, essentially creating a pipeline of sorts.”

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <float.h>

int main(int argc, char** argv)
{
    unsigned int numdigits;
    unsigned char* pi;
    unsigned int alength;
	unsigned int a[8], atmp;
    unsigned int piLength = 0;
    unsigned int nines = 1;
    unsigned int predigit = 0;
    unsigned int i, j;
    unsigned int *q;
	unsigned int qtmp, p, x;
	FILE* fp;
	_control87( _RC_CHOP, _MCW_RC );

    if (argc <= 1) numdigits=20000;
    else numdigits = atoi(argv[1]);
    
	pi = (unsigned char*) calloc(sizeof(unsigned char)+sizeof(unsigned int),numdigits+512);
	q = (unsigned int*)&pi[(numdigits + sizeof(unsigned int)*(256))&0xffffff00];
	alength = ((numdigits<<3)+(numdigits<<1))/3;

	p=(alength<<1)-1;
	i=alength;
	if (numdigits>8){
		numdigits-=7;
		for (;i>8;i-=8){
			x = 20 + q[0]*i;
			q[0]=x/p;
			a[0]=x-(q[0]*p);

			x = 20 + q[0]*(i-1);
			q[0]=x/(p-2);
			a[1]=x-(q[0]*(p-2));

			x = 20 + q[0]*(i-2);
			q[0]=x/(p-4);
			a[2]=x-(q[0]*(p-4));

			x = 20 + q[0]*(i-3);
			q[0]=x/(p-6);
			a[3]=x-(q[0]*(p-6));

			x = 20 + q[0]*(i-4);
			q[0]=x/(p-8);
			a[4]=x-(q[0]*(p-8));

			x = 20 + q[0]*(i-5);
			q[0]=x/(p-10);
			a[5]=x-(q[0]*(p-10));

			x = 20 + q[0]*(i-6);
			q[0]=x/(p-12);
			a[6]=x-(q[0]*(p-12));


			x = (a[0]<<3) + (a[0]<<1) + q[1]*i;
			q[1]=x/p;
			a[0]=x-(q[1]*p);

			x = (a[1]<<3) + (a[1]<<1) + q[1]*(i-1);
			q[1]=x/(p-2);
			a[1]=x-(q[1]*(p-2));

			x = (a[2]<<3) + (a[2]<<1) + q[1]*(i-2);
			q[1]=x/(p-4);
			a[2]=x-(q[1]*(p-4));

			x = (a[3]<<3) + (a[3]<<1) + q[1]*(i-3);
			q[1]=x/(p-6);
			a[3]=x-(q[1]*(p-6));

			x = (a[4]<<3) + (a[4]<<1) + q[1]*(i-4);
			q[1]=x/(p-8);
			a[4]=x-(q[1]*(p-8));

			x = (a[5]<<3) + (a[5]<<1) + q[1]*(i-5);
			q[1]=x/(p-10);
			a[5]=x-(q[1]*(p-10));


			x = (a[0]<<3) + (a[0]<<1) + q[2]*i;
			q[2]=x/p;
			a[0]=x-(q[2]*p);

			x = (a[1]<<3) + (a[1]<<1) + q[2]*(i-1);
			q[2]=x/(p-2);
			a[1]=x-(q[2]*(p-2));

			x = (a[2]<<3) + (a[2]<<1) + q[2]*(i-2);
			q[2]=x/(p-4);
			a[2]=x-(q[2]*(p-4));

			x = (a[3]<<3) + (a[3]<<1) + q[2]*(i-3);
			q[2]=x/(p-6);
			a[3]=x-(q[2]*(p-6));

			x = (a[4]<<3) + (a[4]<<1) + q[2]*(i-4);
			q[2]=x/(p-8);
			a[4]=x-(q[2]*(p-8));


			x = (a[0]<<3) + (a[0]<<1) + q[3]*i;
			q[3]=x/p;
			a[0]=x-(q[3]*p);

			x = (a[1]<<3) + (a[1]<<1) + q[3]*(i-1);
			q[3]=x/(p-2);
			a[1]=x-(q[3]*(p-2));

			x = (a[2]<<3) + (a[2]<<1) + q[3]*(i-2);
			q[3]=x/(p-4);
			a[2]=x-(q[3]*(p-4));

			x = (a[3]<<3) + (a[3]<<1) + q[3]*(i-3);
			q[3]=x/(p-6);
			a[3]=x-(q[3]*(p-6));


			x = (a[0]<<3) + (a[0]<<1) + q[4]*i;
			q[4]=x/p;
			a[0]=x-(q[4]*p);

			x = (a[1]<<3) + (a[1]<<1) + q[4]*(i-1);
			q[4]=x/(p-2);
			a[1]=x-(q[4]*(p-2));

			x = (a[2]<<3) + (a[2]<<1) + q[4]*(i-2);
			q[4]=x/(p-4);
			a[2]=x-(q[4]*(p-4));


			x = (a[0]<<3) + (a[0]<<1) + q[5]*i;
			q[5]=x/p;
			a[0]=x-(q[5]*p);

			x = (a[1]<<3) + (a[1]<<1) + q[5]*(i-1);
			q[5]=x/(p-2);
			a[1]=x-(q[5]*(p-2));


			x = (a[0]<<3) + (a[0]<<1) + q[6]*i;
			q[6]=x/p;
			a[0]=x-(q[6]*p);


			a[7]=2;
			for (j=0;j<numdigits;j++){
				float qf[8], pf;
				unsigned int loopind;
				float decp;
				unsigned int xa[8];
				pf=(float)p;
				for (loopind=0;loopind<8;loopind++)
					xa[loopind] = (a[loopind]<<3) + (a[loopind]<<1) + q[j+7-loopind]*(i-loopind);
				for (loopind=0;loopind<8;loopind++)
					qf[loopind]=(float)xa[loopind];
				for (loopind=0,decp=0.0;loopind<8;decp+=2,loopind++){
					qf[loopind]/=(pf-decp);
					qf[loopind]+=(float)0.00000095367431640625;//1/(1024*1024)
				}
				for (loopind=0;loopind<8;loopind++)
					q[j+7-loopind]=(unsigned int)qf[loopind];
				for (loopind=0;loopind<8;loopind++)
					a[loopind]=xa[loopind]-(q[j+7-loopind]*(p-(loopind+loopind)));
			}
			x = (a[7]<<3) + (a[7]<<1) + q[j]*(i-7);
			q[j]=x/(p-14);
			a[7]=x-(q[j]*(p-14));



			x = (a[6]<<3) + (a[6]<<1) + q[j+1]*(i-6);
			q[j+1]=x/(p-12);
			a[6]=x-(q[j+1]*(p-12));

			x = (a[7]<<3) + (a[7]<<1) + q[j+1]*(i-7);
			q[j+1]=x/(p-14);
			a[7]=x-(q[j+1]*(p-14));



			x = (a[5]<<3) + (a[5]<<1) + q[j+2]*(i-5);
			q[j+2]=x/(p-10);
			a[5]=x-(q[j+2]*(p-10));

			x = (a[6]<<3) + (a[6]<<1) + q[j+2]*(i-6);
			q[j+2]=x/(p-12);
			a[6]=x-(q[j+2]*(p-12));

			x = (a[7]<<3) + (a[7]<<1) + q[j+2]*(i-7);
			q[j+2]=x/(p-14);
			a[7]=x-(q[j+2]*(p-14));



			x = (a[4]<<3) + (a[4]<<1) + q[j+3]*(i-4);
			q[j+3]=x/(p-8);
			a[4]=x-(q[j+3]*(p-8));

			x = (a[5]<<3) + (a[5]<<1) + q[j+3]*(i-5);
			q[j+3]=x/(p-10);
			a[5]=x-(q[j+3]*(p-10));

			x = (a[6]<<3) + (a[6]<<1) + q[j+3]*(i-6);
			q[j+3]=x/(p-12);
			a[6]=x-(q[j+3]*(p-12));

			x = (a[7]<<3) + (a[7]<<1) + q[j+3]*(i-7);
			q[j+3]=x/(p-14);
			a[7]=x-(q[j+3]*(p-14));



			x = (a[3]<<3) + (a[3]<<1) + q[j+4]*(i-3);
			q[j+4]=x/(p-6);
			a[3]=x-(q[j+4]*(p-6));

			x = (a[4]<<3) + (a[4]<<1) + q[j+4]*(i-4);
			q[j+4]=x/(p-8);
			a[4]=x-(q[j+4]*(p-8));

			x = (a[5]<<3) + (a[5]<<1) + q[j+4]*(i-5);
			q[j+4]=x/(p-10);
			a[5]=x-(q[j+4]*(p-10));

			x = (a[6]<<3) + (a[6]<<1) + q[j+4]*(i-6);
			q[j+4]=x/(p-12);
			a[6]=x-(q[j+4]*(p-12));

			x = (a[7]<<3) + (a[7]<<1) + q[j+4]*(i-7);
			q[j+4]=x/(p-14);
			a[7]=x-(q[j+4]*(p-14));



			x = (a[2]<<3) + (a[2]<<1) + q[j+5]*(i-2);
			q[j+5]=x/(p-4);
			a[2]=x-(q[j+5]*(p-4));

			x = (a[3]<<3) + (a[3]<<1) + q[j+5]*(i-3);
			q[j+5]=x/(p-6);
			a[3]=x-(q[j+5]*(p-6));

			x = (a[4]<<3) + (a[4]<<1) + q[j+5]*(i-4);
			q[j+5]=x/(p-8);
			a[4]=x-(q[j+5]*(p-8));

			x = (a[5]<<3) + (a[5]<<1) + q[j+5]*(i-5);
			q[j+5]=x/(p-10);
			a[5]=x-(q[j+5]*(p-10));

			x = (a[6]<<3) + (a[6]<<1) + q[j+5]*(i-6);
			q[j+5]=x/(p-12);
			a[6]=x-(q[j+5]*(p-12));

			x = (a[7]<<3) + (a[7]<<1) + q[j+5]*(i-7);
			q[j+5]=x/(p-14);
			a[7]=x-(q[j+5]*(p-14));



			x = (a[1]<<3) + (a[1]<<1) + q[j+6]*(i-1);
			q[j+6]=x/(p-2);
			a[1]=x-(q[j+6]*(p-2));

			x = (a[2]<<3) + (a[2]<<1) + q[j+6]*(i-2);
			q[j+6]=x/(p-4);
			a[2]=x-(q[j+6]*(p-4));

			x = (a[3]<<3) + (a[3]<<1) + q[j+6]*(i-3);
			q[j+6]=x/(p-6);
			a[3]=x-(q[j+6]*(p-6));

			x = (a[4]<<3) + (a[4]<<1) + q[j+6]*(i-4);
			q[j+6]=x/(p-8);
			a[4]=x-(q[j+6]*(p-8));

			x = (a[5]<<3) + (a[5]<<1) + q[j+6]*(i-5);
			q[j+6]=x/(p-10);
			a[5]=x-(q[j+6]*(p-10));

			x = (a[6]<<3) + (a[6]<<1) + q[j+6]*(i-6);
			q[j+6]=x/(p-12);
			a[6]=x-(q[j+6]*(p-12));

			x = (a[7]<<3) + (a[7]<<1) + q[j+6]*(i-7);
			q[j+6]=x/(p-14);
			a[7]=x-(q[j+6]*(p-14));


			p-=16;
		}
		numdigits+=7;
	}
	for (;i>1;i–){
		atmp=2;
		for (j=0;j<numdigits;j++){
			x = (atmp<<3) + (atmp<<1) + q[j]*i;
			q[j]=x/p;
			atmp=x-(q[j]*p);
		}
		p-=2;
	}
	atmp=2;
	for (j=0;j<numdigits;++j){
		x=((atmp<<3)+(atmp<<1))+q[j];
		qtmp = x / 10;
		atmp = x-((qtmp<<3)+(qtmp<<1));
        if (qtmp == 9)
            ++nines;
        else{
			if (qtmp == 10)
			{
				pi[piLength] = (unsigned char) (predigit + ‘1’);
				memset(&pi[piLength+1],’0′,nines);
				predigit = 0;
			}
			else
			{
				pi[piLength] = (unsigned char) (predigit + ‘0’);
				memset(&pi[piLength+1],’9′,nines);
				predigit = qtmp;
			}
			piLength += nines;
			nines = 1;
		}
	}

    pi[piLength] = (unsigned char)(predigit + ‘0’);
    pi[piLength+1] = ‘\n’;
	piLength+=2;


    if (argc > 2){
		fp = fopen(argv[2],”w”);
	}else{
        fp = fopen(“ic.txt”, “w”);
	}
    if (fp == NULL)
    {
       fprintf(stderr, “Cannot open ic.txt\n”);
       return 2;
    }
	fwrite(pi,sizeof(unsigned char),piLength,fp);
    fclose(fp);

    free(pi);

    return 0;
}

Pages: « Prev   1 2 3 4 5 6 7 8 9   Next »

Discuss (16 comments)