libtom
diff --git a/‎bn.pdf
1.35 KB b/‎bn.pdf
1.35 KB
diff --git a/‎bn.tex
+2-2 b/‎bn.tex
+2-2
diff --git a/‎bn_fast_mp_invmod.c
+22-51 b/‎bn_fast_mp_invmod.c
+22-51
diff --git a/‎bn_fast_mp_montgomery_reduce.c
+2-2 b/‎bn_fast_mp_montgomery_reduce.c
+2-2
diff --git a/‎bn_fast_s_mp_mul_digs.c
+20-18 b/‎bn_fast_s_mp_mul_digs.c
+20-18
diff --git a/‎bn_fast_s_mp_mul_high_digs.c
+12-4 b/‎bn_fast_s_mp_mul_high_digs.c
+12-4
diff --git a/‎bn_fast_s_mp_sqr.c
+9-9 b/‎bn_fast_s_mp_sqr.c
+9-9
@@ -1,7 +1,7 @@
-\documentclass[]{report}
+\documentclass[]{article}
 \begin{document}
 
-\title{LibTomMath v0.16 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org }
+\title{LibTomMath v0.17 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org }
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 \newpage
 
@@ -27,41 +27,18 @@ fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
   int     res, neg;
 
   /* init all our temps */
-  if ((res = mp_init (&x)) != MP_OKAY) {
-    goto __ERR;
-  }
-
-  if ((res = mp_init (&y)) != MP_OKAY) {
-    goto __X;
-  }
-
-  if ((res = mp_init (&u)) != MP_OKAY) {
-    goto __Y;
-  }
-
-  if ((res = mp_init (&v)) != MP_OKAY) {
-    goto __U;
-  }
-
-  if ((res = mp_init (&B)) != MP_OKAY) {
-    goto __V;
-  }
-
-  if ((res = mp_init (&D)) != MP_OKAY) {
-    goto __B;
+  if ((res = mp_init_multi(&x, &y, &u, &v, &B, &D, NULL)) != MP_OKAY) {
+     return res;
   }
 
   /* x == modulus, y == value to invert */
   if ((res = mp_copy (b, &x)) != MP_OKAY) {
-    goto __D;
-  }
-  if ((res = mp_copy (a, &y)) != MP_OKAY) {
-    goto __D;
+    goto __ERR;
   }
 
-  /* we need |y| */
-  if ((res = mp_abs (&y, &y)) != MP_OKAY) {
-    goto __D;
+  /* we need y = |a| */
+  if ((res = mp_abs (a, &y)) != MP_OKAY) {
+    goto __ERR;
   }
 
   /* 2. [modified] if x,y are both even then return an error! 
@@ -70,15 +47,15 @@ fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
    */
   if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
     res = MP_VAL;
-    goto __D;
+    goto __ERR;
   }
 
   /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
   if ((res = mp_copy (&x, &u)) != MP_OKAY) {
-    goto __D;
+    goto __ERR;
   }
   if ((res = mp_copy (&y, &v)) != MP_OKAY) {
-    goto __D;
+    goto __ERR;
   }
   mp_set (&D, 1);
 
@@ -87,57 +64,57 @@ fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
   while (mp_iseven (&u) == 1) {
     /* 4.1 u = u/2 */
     if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
     /* 4.2 if B is odd then B = B - x */
     if (mp_iseven (&B) == 0) {
       if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
-	goto __D;
+        goto __ERR;
       }
     }
     /* B = B/2 */
     if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
   }
 
   /* 5.  while v is even do */
   while (mp_iseven (&v) == 1) {
     /* 5.1 v = v/2 */
     if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
     /* 5.2 if D is odd then */
     if (mp_iseven (&D) == 0) {
       /* D = (D-x)/2 */
       if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
-	goto __D;
+        goto __ERR;
       }
     }
     /* D = D/2 */
     if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
   }
 
   /* 6.  if u >= v then */
   if (mp_cmp (&u, &v) != MP_LT) {
     /* u = u - v, B = B - D */
     if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
 
     if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
   } else {
     /* v = v - u, D = D - B */
     if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
 
     if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
   }
 
@@ -151,26 +128,20 @@ fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
   /* if v != 1 then there is no inverse */
   if (mp_cmp_d (&v, 1) != MP_EQ) {
     res = MP_VAL;
-    goto __D;
+    goto __ERR;
   }
 
   /* D now holds the inverse (possibly negative, fixed up below) */
   neg = a->sign;
   while (D.sign == MP_NEG) {
     if ((res = mp_add (&D, b, &D)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
   }
   mp_exch (&D, c);
   c->sign = neg;
   res = MP_OKAY;
 
-__D:mp_clear (&D);
-__B:mp_clear (&B);
-__V:mp_clear (&v);
-__U:mp_clear (&u);
-__Y:mp_clear (&y);
-__X:mp_clear (&x);
-__ERR:
+__ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
   return res;
 }
@@ -26,7 +26,7 @@ int
 fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
 {
   int     ix, res, olduse;
-  mp_word W[512];
+  mp_word W[MP_WARRAY];
 
   /* get old used count */
   olduse = a->used;
@@ -92,7 +92,7 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
 
       /* inner loop */
       for (iy = 0; iy < m->used; iy++) {
-	*_W++ += ((mp_word) ui) * ((mp_word) * tmpx++);
+    *_W++ += ((mp_word) ui) * ((mp_word) * tmpx++);
       }
     }
 
 
@@ -16,14 +16,16 @@
 
 /* Fast (comba) multiplier
  *
- * This is the fast column-array [comba] multiplier.  It is designed to compute
- * the columns of the product first then handle the carries afterwards.  This
- * has the effect of making the nested loops that compute the columns very
+ * This is the fast column-array [comba] multiplier.  It is 
+ * designed to compute the columns of the product first 
+ * then handle the carries afterwards.  This has the effect 
+ * of making the nested loops that compute the columns very
  * simple and schedulable on super-scalar processors.
  *
- * This has been modified to produce a variable number of digits of output so
- * if say only a half-product is required you don't have to compute the upper half
- * (a feature required for fast Barrett reduction).
+ * This has been modified to produce a variable number of 
+ * digits of output so if say only a half-product is required 
+ * you don't have to compute the upper half (a feature 
+ * required for fast Barrett reduction).
  *
  * Based on Algorithm 14.12 on pp.595 of HAC.
  *
@@ -32,7 +34,7 @@ int
 fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
   int     olduse, res, pa, ix;
-  mp_word W[512];
+  mp_word W[MP_WARRAY];
 
   /* grow the destination as required */
   if (c->alloc < digs) {
@@ -47,10 +49,9 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
   /* calculate the columns */
   pa = a->used;
   for (ix = 0; ix < pa; ix++) {
-
-    /* this multiplier has been modified to allow you to control how many digits 
-     * of output are produced.  So at most we want to make upto "digs" digits
-     * of output.
+    /* this multiplier has been modified to allow you to 
+     * control how many digits of output are produced.  
+     * So at most we want to make upto "digs" digits of output.
      *
      * this adds products to distinct columns (at ix+iy) of W
      * note that each step through the loop is not dependent on
@@ -73,14 +74,14 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
        */
       _W = W + ix;
 
-      /* the number of digits is limited by their placement.  E.g. 
+      /* the number of digits is limited by their placement.  E.g.
          we avoid multiplying digits that will end up above the # of
          digits of precision requested
        */
       pb = MIN (b->used, digs - ix);
 
       for (iy = 0; iy < pb; iy++) {
-	*_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+        *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
       }
     }
 
@@ -97,11 +98,12 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
      * correct result we must take the extra bits from each column and
      * carry them down
      *
-     * Note that while this adds extra code to the multiplier it saves time
-     * since the carry propagation is removed from the above nested loop.
-     * This has the effect of reducing the work from N*(N+N*c)==N^2 + c*N^2 to
-     * N^2 + N*c where c is the cost of the shifting.  On very small numbers
-     * this is slower but on most cryptographic size numbers it is faster.
+     * Note that while this adds extra code to the multiplier it 
+     * saves time since the carry propagation is removed from the 
+     * above nested loop.  This has the effect of reducing the work
+     * from N*(N+N*c)==N**2 + c*N**2 to N**2 + N*c where c is the 
+     * cost of the shifting.  On very small numbers this is slower 
+     * but on most cryptographic size numbers it is faster.
      */
     tmpc = c->dp;
     for (ix = 1; ix < digs; ix++) {
 
@@ -27,7 +27,7 @@ int
 fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
   int     oldused, newused, res, pa, pb, ix;
-  mp_word W[512];
+  mp_word W[MP_WARRAY];
 
   /* calculate size of product and allocate more space if required */
   newused = a->used + b->used + 1;
@@ -55,15 +55,23 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 
       /* alias for right side */
       tmpy = b->dp + iy;
-
+     
       /* alias for the columns of output.  Offset to be equal to or above the 
        * smallest digit place requested 
        */
-      _W = &(W[digs]);
+      _W = W + digs;     
+      
+      /* skip cases below zero where ix > digs */
+      if (iy < 0) {
+         iy    = abs(iy);
+         tmpy += iy;
+         _W   += iy;
+         iy    = 0;
+      }
 
       /* compute column products for digits above the minimum */
       for (; iy < pb; iy++) {
-	*_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+    *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
       }
     }
   }
 
@@ -20,7 +20,7 @@
  * then the carries are computed.  This has the effect of making a very simple
  * inner loop that is executed the most
  *
- * W2 represents the outer products and W the inner.  
+ * W2 represents the outer products and W the inner.
  *
  * A further optimizations is made because the inner products are of the form
  * "A * B * 2".  The *2 part does not need to be computed until the end which is
@@ -33,7 +33,7 @@ int
 fast_s_mp_sqr (mp_int * a, mp_int * b)
 {
   int     olduse, newused, res, ix, pa;
-  mp_word W2[512], W[512];
+  mp_word W2[MP_WARRAY], W[MP_WARRAY];
 
   /* calculate size of product and allocate as required */
   pa = a->used;
@@ -44,9 +44,9 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
     }
   }
 
-  /* zero temp buffer (columns) 
+  /* zero temp buffer (columns)
    * Note that there are two buffers.  Since squaring requires
-   * a outter and inner product and the inner product requires 
+   * a outter and inner product and the inner product requires
    * computing a product and doubling it (a relatively expensive
    * op to perform n^2 times if you don't have to) the inner and
    * outer products are computed in different buffers.  This way
@@ -60,7 +60,7 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
  * values in W2 are only written in even locations which means
  * we can collapse the array to 256 words [and fixup the memset above]
  * provided we also fix up the summations below.  Ideally
- * the fixup loop should be unrolled twice to handle the even/odd 
+ * the fixup loop should be unrolled twice to handle the even/odd
  * cases, and then a final step to handle odd cases [e.g. newused == odd]
  *
  * This will not only save ~8*256 = 2KB of stack but lower the number of
@@ -71,10 +71,10 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
    * the multiplication by two is done afterwards in the N loop.
    */
   for (ix = 0; ix < pa; ix++) {
-    /* compute the outer product 
+    /* compute the outer product
      *
-     * Note that every outer product is computed 
-     * for a particular column only once which means that 
+     * Note that every outer product is computed
+     * for a particular column only once which means that
      * there is no need todo a double precision addition
      */
     W2[ix + ix] = ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
@@ -95,7 +95,7 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
 
       /* inner products */
       for (iy = ix + 1; iy < pa; iy++) {
-	*_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+          *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
       }
     }
   }
Original file line number	Diff line number	Diff line change
`@@ -26,7 +26,7 @@ int`
`26`	`26`	`fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)`
`27`	`27`	`{`
`28`	`28`	`int ix, res, olduse;`
`29`		`- mp_word W[512];`
	`29`	`+ mp_word W[MP_WARRAY];`
`30`	`30`
`31`	`31`	`/* get old used count */`
`32`	`32`	`olduse = a->used;`
`@@ -92,7 +92,7 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)`
`92`	`92`
`93`	`93`	`/* inner loop */`
`94`	`94`	`for (iy = 0; iy < m->used; iy++) {`
`95`		`- *_W++ += ((mp_word) ui) * ((mp_word) * tmpx++);`
	`95`	`+ *_W++ += ((mp_word) ui) * ((mp_word) * tmpx++);`
`96`	`96`	`}`
`97`	`97`	`}`
`98`	`98`
Original file line number	Diff line number	Diff line change
`@@ -20,7 +20,7 @@`
`20`	`20`	`* then the carries are computed. This has the effect of making a very simple`
`21`	`21`	`* inner loop that is executed the most`
`22`	`22`	`*`
`23`		`- * W2 represents the outer products and W the inner.`
	`23`	`+ * W2 represents the outer products and W the inner.`
`24`	`24`	`*`
`25`	`25`	`* A further optimizations is made because the inner products are of the form`
`26`	`26`	`* "A * B * 2". The *2 part does not need to be computed until the end which is`
`@@ -33,7 +33,7 @@ int`
`33`	`33`	`fast_s_mp_sqr (mp_int * a, mp_int * b)`
`34`	`34`	`{`
`35`	`35`	`int olduse, newused, res, ix, pa;`
`36`		`- mp_word W2[512], W[512];`
	`36`	`+ mp_word W2[MP_WARRAY], W[MP_WARRAY];`
`37`	`37`
`38`	`38`	`/* calculate size of product and allocate as required */`
`39`	`39`	`pa = a->used;`
`@@ -44,9 +44,9 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)`
`44`	`44`	`}`
`45`	`45`	`}`
`46`	`46`
`47`		`- /* zero temp buffer (columns)`
	`47`	`+ /* zero temp buffer (columns)`
`48`	`48`	`* Note that there are two buffers. Since squaring requires`
`49`		`- * a outter and inner product and the inner product requires`
	`49`	`+ * a outter and inner product and the inner product requires`
`50`	`50`	`* computing a product and doubling it (a relatively expensive`
`51`	`51`	`* op to perform n^2 times if you don't have to) the inner and`
`52`	`52`	`* outer products are computed in different buffers. This way`
`@@ -60,7 +60,7 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)`
`60`	`60`	`* values in W2 are only written in even locations which means`
`61`	`61`	`* we can collapse the array to 256 words [and fixup the memset above]`
`62`	`62`	`* provided we also fix up the summations below. Ideally`
`63`		`- * the fixup loop should be unrolled twice to handle the even/odd`
	`63`	`+ * the fixup loop should be unrolled twice to handle the even/odd`
`64`	`64`	`* cases, and then a final step to handle odd cases [e.g. newused == odd]`
`65`	`65`	`*`
`66`	`66`	`* This will not only save ~8*256 = 2KB of stack but lower the number of`
`@@ -71,10 +71,10 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)`
`71`	`71`	`* the multiplication by two is done afterwards in the N loop.`
`72`	`72`	`*/`
`73`	`73`	`for (ix = 0; ix < pa; ix++) {`
`74`		`- /* compute the outer product`
	`74`	`+ /* compute the outer product`
`75`	`75`	`*`
`76`		`- * Note that every outer product is computed`
`77`		`- * for a particular column only once which means that`
	`76`	`+ * Note that every outer product is computed`
	`77`	`+ * for a particular column only once which means that`
`78`	`78`	`* there is no need todo a double precision addition`
`79`	`79`	`*/`
`80`	`80`	`W2[ix + ix] = ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);`
`@@ -95,7 +95,7 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)`
`95`	`95`
`96`	`96`	`/* inner products */`
`97`	`97`	`for (iy = ix + 1; iy < pa; iy++) {`
`98`		`- *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);`
	`98`	`+ *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);`
`99`	`99`	`}`
`100`	`100`	`}`
`101`	`101`	`}`