How % operator works when we use negative values for operation? - operators

When I execute the statement -13 % -10 in C, I get -3 as the output. I'm curious to know why the output is not 3.

I did not know the answer to your question, either. So I expanded your original equation and inserted them into a short Netbeans/GCC C program:
/*
* File: main.c
* Author: Colleen
*
* Created on December 16, 2015, 9:43 AM
* Testing modulus operations with negative numbers.
*/
#include <windef.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
/*
*
*/
/*
 * Prints A % B for all four sign combinations of the operands, starting
 * from A = -13 and B = -10, one numbered line per combination.
 * argc and argv are not used.
 */
int main(int argc, char** argv) {
/* INT16 is presumably the 16-bit signed typedef from <windef.h> -- confirm. */
INT16 A = -13;
INT16 B = -10;
/* 1: negative % negative */
printf ("1. %3d %% %3d = %3d\n", A, B, A % B);
/* 2: positive % negative (dividend negated) */
printf ("2. %3d %% %3d = %3d\n", -A, B, (-A)% B);
/* 3: negative % positive (divisor negated) */
printf ("3. %3d %% %3d = %3d\n", A, -B, A%(-B));
/* 4: positive % positive (both negated) */
printf ("4. %3d %% %3d = %3d\n", -A, -B, (-A)%(-B));
return (EXIT_SUCCESS);
}
Here were my results:
1. -13 % -10 = -3
2. 13 % -10 = 3
3. -13 % 10 = -3
4. 13 % 10 = 3
RUN SUCCESSFUL (total time: 35ms)
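So it looks to me like the dividend (the number you are dividing, or "A" above) and the remainder (the result of A % B, the last number on each line) will always have the same sign. The sign of the divisor ("B" above) does not affect the sign of the result.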

Related

Inefficient kernel function

Is there any possibility to accelerate this simple kernel function? I have thought about using shared-memory but N is equal to 507904, so it is much more than shared memory array could be.
My program creates blocks of 256 threads each.
// For the element indexed by this thread, stores into F the sum of the
// squared .x and .y members of a[i] and b[i], divided by f.
// One thread handles one element; threads whose index is >= N do nothing.
__global__ void compute(COMPLEX_TYPE *a, COMPLEX_TYPE *b,
                        FLOAT_TYPE *F, FLOAT_TYPE f, int N)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N)
        return;  // past the end of the data

    // Read each input element once into a local copy.
    const COMPLEX_TYPE av = a[idx];
    const COMPLEX_TYPE bv = b[idx];
    F[idx] = ( av.x*av.x + av.y*av.y + bv.x*bv.x + bv.y*bv.y) / (f);
}
The simplest general optimisation would be something like this:
// Grid-stride version of compute: each thread starts at its global index and
// advances by the total number of launched threads until it passes N, so any
// grid size covers all N elements.
//
//   a, b : read-only inputs (const __restrict__ lets the compiler assume
//          they are not written through F)
//   F    : output, F[i] = (a[i].x^2 + a[i].y^2 + b[i].x^2 + b[i].y^2) / f
//   N    : number of elements
__global__ void compute(const COMPLEX_TYPE * __restrict__ a,
                        const COMPLEX_TYPE * __restrict__ b,
                        FLOAT_TYPE *F, FLOAT_TYPE f, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
#pragma unroll 8
    for(; i < N; i += blockDim.x * gridDim.x)
    {
        // Load both operands into locals before doing any arithmetic.
        COMPLEX_TYPE aval = a[i], bval = b[i];
        FLOAT_TYPE Fval;
        Fval = ( aval.x*aval.x + aval.y*aval.y + bval.x*bval.x + bval.y*bval.y) / (f);
        F[i] = Fval;
    }
}
[disclaimer: written in browser, not tested, use at own risk]
The idea here is to launch only as many threads as will execute concurrently on your target GPU, and then have every thread perform multiple operations rather than one. This helps amortise a lot of the fixed overhead at the block scheduler and setup code level and improve the overall efficiency. On most architectures, this will probably be memory bandwidth limited anyway, so memory coalescing and transaction optimisation is about the most important performance optimisation you will be able to make.
EDIT: Since this answer was marked CW, I elected to add my tests here, rather than create my own answer. If anyone objects to this, please just roll back the edit to a previous acceptable version. I'm not adding any new ideas, just testing those provided by @talonmies and @JanLucas.
In my test case, the suggestions (excepting the unroll pragma) offered by @talonmies seem to give rise to a ~10% perf improvement. The suggestion by @JanLucas, to replace the floating-point divide with a floating-point multiply, if acceptable, seems to give about a doubling of performance. This will obviously vary depending on GPU and other specifics. Here's my test:
$ cat t891.cu
#include <cuComplex.h>
#include <stdio.h>
#include <stdlib.h>
#define DSIZE 507904
#define nTPB 256
#define nBLK 256
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
// Returns the current gettimeofday() time in microseconds, minus `start`.
// Call with 0 to take a timestamp, then call again with that timestamp to
// get the elapsed microseconds.
long long dtime_usec(unsigned long long start){
    timeval now;
    gettimeofday(&now, 0);
    const unsigned long long now_usec = (now.tv_sec*USECPSEC) + now.tv_usec;
    return now_usec - start;
}
typedef cuFloatComplex COMPLEX_TYPE;
typedef float FLOAT_TYPE;
// Baseline kernel: one thread per element. Stores into F the sum of the
// squared .x and .y members of a[i] and b[i], divided by f.
// Threads whose index is >= N do nothing.
__global__ void compute(COMPLEX_TYPE *a, COMPLEX_TYPE *b,
                        FLOAT_TYPE *F, FLOAT_TYPE f, int N)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N)
        return;  // past the end of the data

    // Read each input element once into a local copy.
    const COMPLEX_TYPE av = a[idx];
    const COMPLEX_TYPE bv = b[idx];
    F[idx] = ( av.x*av.x + av.y*av.y + bv.x*bv.x + bv.y*bv.y) / (f);
}
// Grid-stride variant of compute: every thread walks the data in steps of
// the total launched thread count, so the grid may be smaller than N.
// Result per element: (a.x^2 + a.y^2 + b.x^2 + b.y^2) / f.
// An explicit "#pragma unroll 8" on the loop was tried and left disabled.
__global__ void compute_imp(const COMPLEX_TYPE * __restrict__ a,
                            const COMPLEX_TYPE * __restrict__ b,
                            FLOAT_TYPE *F, FLOAT_TYPE f, int N)
{
    // Total number of threads in the grid; kept unsigned like the original
    // blockDim.x * gridDim.x expression.
    const unsigned int stride = blockDim.x * gridDim.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    while (idx < N)
    {
        const COMPLEX_TYPE av = a[idx];
        const COMPLEX_TYPE bv = b[idx];
        F[idx] = ( av.x*av.x + av.y*av.y + bv.x*bv.x + bv.y*bv.y) / (f);
        idx += stride;
    }
}
// Same grid-stride loop as compute_imp, but multiplies by f instead of
// dividing; the caller is expected to pass the reciprocal of the divisor.
// Result per element: (a.x^2 + a.y^2 + b.x^2 + b.y^2) * f.
// An explicit "#pragma unroll 8" on the loop was tried and left disabled.
__global__ void compute_imp2(const COMPLEX_TYPE * __restrict__ a,
                             const COMPLEX_TYPE * __restrict__ b,
                             FLOAT_TYPE *F, FLOAT_TYPE f, int N)
{
    // Total number of threads in the grid; kept unsigned like the original
    // blockDim.x * gridDim.x expression.
    const unsigned int stride = blockDim.x * gridDim.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    while (idx < N)
    {
        const COMPLEX_TYPE av = a[idx];
        const COMPLEX_TYPE bv = b[idx];
        F[idx] = ( av.x*av.x + av.y*av.y + bv.x*bv.x + bv.y*bv.y) * (f);
        idx += stride;
    }
}
// Benchmark driver. Allocates DSIZE-element device buffers, then for each of
// the three kernels runs one untimed warm-up launch followed by one timed
// launch, and prints the three wall-clock times in seconds.
//
// Every CUDA call and kernel launch is followed by cudaCheckErrors so a
// failure is reported where it happens rather than only at the end. Checks
// are placed outside the timed regions. Inputs are zero-filled so the
// kernels never read uninitialised device memory. Each warm-up uses the same
// launch configuration as the timed run it precedes.
int main(){
  COMPLEX_TYPE *d_A = NULL, *d_B = NULL;
  FLOAT_TYPE *d_F = NULL, f = 4.0f;
  const int fullGrid = (DSIZE+nTPB-1)/nTPB;  // one thread per element

  cudaMalloc(&d_A, DSIZE*sizeof(COMPLEX_TYPE));
  cudaCheckErrors("cudaMalloc d_A failed");
  cudaMalloc(&d_B, DSIZE*sizeof(COMPLEX_TYPE));
  cudaCheckErrors("cudaMalloc d_B failed");
  cudaMalloc(&d_F, DSIZE*sizeof(FLOAT_TYPE));
  cudaCheckErrors("cudaMalloc d_F failed");
  cudaMemset(d_A, 0, DSIZE*sizeof(COMPLEX_TYPE));
  cudaMemset(d_B, 0, DSIZE*sizeof(COMPLEX_TYPE));
  cudaCheckErrors("cudaMemset failed");

  //warm-up
  compute<<<fullGrid,nTPB>>>(d_A, d_B, d_F, f, DSIZE);
  cudaDeviceSynchronize();
  cudaCheckErrors("compute warm-up failed");
  unsigned long long t1 = dtime_usec(0);
  compute<<<fullGrid,nTPB>>>(d_A, d_B, d_F, f, DSIZE);
  cudaDeviceSynchronize();
  t1 = dtime_usec(t1);
  cudaCheckErrors("compute failed");

  //warm-up
  compute_imp<<<nBLK,nTPB>>>(d_A, d_B, d_F, f, DSIZE);
  cudaDeviceSynchronize();
  cudaCheckErrors("compute_imp warm-up failed");
  unsigned long long t2 = dtime_usec(0);
  compute_imp<<<nBLK,nTPB>>>(d_A, d_B, d_F, f, DSIZE);
  cudaDeviceSynchronize();
  t2 = dtime_usec(t2);
  cudaCheckErrors("compute_imp failed");

  //warm-up (compute_imp2 multiplies, so pass the reciprocal of f)
  compute_imp2<<<nBLK,nTPB>>>(d_A, d_B, d_F, 1/f, DSIZE);
  cudaDeviceSynchronize();
  cudaCheckErrors("compute_imp2 warm-up failed");
  unsigned long long t3 = dtime_usec(0);
  compute_imp2<<<nBLK,nTPB>>>(d_A, d_B, d_F, 1/f, DSIZE);
  cudaDeviceSynchronize();
  t3 = dtime_usec(t3);
  cudaCheckErrors("compute_imp2 failed");

  printf("t1: %fs, t2: %fs, t3: %fs\n", t1/(float)USECPSEC, t2/(float)(USECPSEC), t3/(float)USECPSEC);

  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_F);
  cudaCheckErrors("cudaFree failed");
  return 0;
}
$ nvcc -O3 -o t891 t891.cu
$ ./t891
t1: 0.000226s, t2: 0.000209s, t3: 0.000110s
$
Notes:
The unroll pragma doesn't seem to help (it makes it run slower, for a few test cases I tried). The compiler already will, in some cases, unroll loops without a specific hint, and loop unrolling is generally an optimization that requires tuning, perhaps careful tuning.
The modification to the kernel proposed by @talonmies to create a grid-striding loop is one of the factors that would need to be taken into account to make a specific loop-unroll trip count useful. The overall grid dimension should be reduced by a factor equal to the unroll trip count, at least. However, I wasn't able to find a "sweet spot".
I mostly tested on a Quadro5000 (Fermi cc2.0 GPU), CUDA 7.5RC, Fedora20. Certainly the behavior will be different on different GPUs, especially newer ones.
The nBLK parameter in this code is another "tunable" parameter, however I saw little variation with this when above about 64 or so. The best case might be to have a grid equal in size to the data.

Clang, link time optimization fails for AVX horizontal add

I have a small piece of testing code which calculates the dot products of two vectors with a third vector using AVX instructions (A dot C and B dot C below). It also adds the two products, but that is just to make the function return something for this example.
#include <iostream>
#include <immintrin.h>
double compute(const double *x)
{
__m256d A = _mm256_loadu_pd(x);
__m256d B = _mm256_loadu_pd(x + 4);
__m256d C = _mm256_loadu_pd(x + 8);
__m256d c1 = _mm256_mul_pd(A, C);
__m256d c2 = _mm256_mul_pd(B, C);
__m256d tmp = _mm256_hadd_pd(c1, c2);
__m128d lo = _mm256_extractf128_pd(tmp, 0);
__m128d hi = _mm256_extractf128_pd(tmp, 1);
__m128d dotp = _mm_add_pd(lo, hi);
double y[2];
_mm_store_pd(y, dotp);
return y[0] + y[1];
}
// Runs compute() on a fixed 12-element test vector and prints the result.
// argc and argv are not used.
int main(int argc, char *argv[])
{
    const double v[12] = {0.3, 2.9, 1.3, 4.0, -1.0, -2.1, -3.0, -4.0, 0.0, 2.0, 1.3, 1.2};

    std::cout << "AVX" << std::endl;
    const double result = compute(v);
    std::cout << "x = " << result << std::endl;
    return 0;
}
When I compile as
clang++ -O3 -mavx main.cc -o main
everything works fine. If I enable link time optimization:
clang++ -flto -O3 -mavx main.cc -o main
I get the following error: "LLVM ERROR: Do not know how to split the result of this operator!". I have narrowed the culprit down to the _mm256_hadd_pd statement. If this is exchanged with e.g. _mm256_add_pd, link-time optimization works again. I realize that this is a silly example to use link-time optimization for, but the error occurred in a different context where link-time optimization is extremely helpful.
Can anyone explain what is going on here?

Having trouble creating a weather converter program

I started school for computer programming just a couple weeks ago and we just started Objective-C! We need to convert Celsius to Fahrenheit and Kelvin. To do that I must input the temperature in Celsius. Then I use this equation to get Fahrenheit: Celsius * 9 / 5 + 32. To get Kelvin I add 273.15 to the Celsius value.
#include <stdio.h>
/*
 * Asker's attempt at a Celsius -> Fahrenheit/Kelvin converter.
 * Kept as posted; the review notes mark the problems the answers address.
 */
int main(void)
{
float Celsius;
/* NOTE(review): Celsius has not been assigned yet, so both conversions
 * below are computed from an indeterminate value, before any input. */
float Farenheight = Celsius * 9 / 5 + 32;
float Kelvin = Celsius + 273.15;
printf("How many degrees in Celsius?");
/* NOTE(review): the conversions are %s/%s/%d but the arguments are float
 * values, not addresses of storage of the matching type. */
scanf("%s %s %d", Celsius, Farenheight, Kelvin);
/* NOTE(review): same mismatch here -- %s and %d are given float values. */
printf("C: %s, F: %s, K: %d", Celsius, Farenheight, Kelvin);
}
This is (the second revision of) what I came up with so far, but I am really unsure on how to do this. If anyone can help me that would be great!
Funnily enough, temperature conversion came up in another context earlier today.
Adapting that code to your outline, you need to read the value in celsius before you convert anything to kelvin or fahrenheit (whereas your code converts an uninitialized value, which is not a good idea):
/* Read one temperature in degrees Celsius and print it in K and °F. */
double celsius;
printf("What is the temperature in degrees Celsius? ");
/* scanf returns the number of items converted; proceed only if it read one. */
if (scanf("%lf", &celsius) == 1)
{
double kelvin = celsius + 273.15;
/* Shift by 40, scale by 9/5, shift back by 40. */
double fahrenheit = (celsius + 40.0) * (9.0 / 5.0) - 40.0;
printf("%7.2f °C = %7.2f K = %7.2f °F\n", celsius, kelvin, fahrenheit);
}
Note that the input is checked for validity before the result is used.
The conversion formula is simpler than the usual one you see quoted, and is symmetric for converting °F to °C or vice versa, the difference being the conversion factor (9.0 / 5.0) vs (5.0 / 9.0). It relies on -40°C = -40°F. Try it:
C =  0°C; (C+40) = 40; (C+40)*9 = 360; (C+40)*9/5 = 72; (C+40)*9/5-40 = 32°F.
F = 32°F; (F+40) = 72; (F+40)*5 = 360; (F+40)*5/9 = 40; (F+40)*5/9-40 =  0°C.
Absolute zero is -273.15°C, 0K, -459.67°F.
Use this code snippet to read input from stdin:
#include <stdio.h>
/*
 * Prompts for a whole number of degrees Celsius and echoes it back.
 *
 * Returns 0 on success. If the input cannot be parsed as an integer, prints
 * a message to stderr and returns 1 instead of printing an uninitialised
 * value. argc and argv are not used.
 */
int main (int argc, char *argv[]) {
    int celsius;
    printf("What is the temperature in celsius? ");
    /* scanf returns the number of successful conversions; anything other
     * than 1 means celsius was not assigned. */
    if (scanf("%d", &celsius) != 1) {
        fprintf(stderr, "Invalid input: expected a whole number of degrees.\n");
        return 1;
    }
    printf("celsius degree = %d\n", celsius);
    return 0;
}

vecLib cblas_sgemm documentation wrong?

I'm trying to multiply two matrices using vecLibs' cblas:
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <vecLib/cblas.h>
/*
 * Asker's attempt to multiply a 2x3 matrix A by a 3x1 matrix B into a 2x1
 * matrix C with cblas_sgemm in row-major order, then print C.
 * Kept as posted; the review notes mark the problems.
 */
int main (void) {
/* NOTE(review): the buffers are allocated but never filled in, checked for
 * NULL, or freed. */
float *A = malloc(sizeof(float) * 2 * 3);
float *B = malloc(sizeof(float) * 3 * 1);
float *C = malloc(sizeof(float) * 2 * 1);
cblas_sgemm(CblasRowMajor,
CblasNoTrans,
CblasNoTrans,
2, /* M: rows of A and C */
1, /* N: columns of B and C */
3, /* K: columns of A, rows of B */
1.0,
/* NOTE(review): lda=2 is what the library rejects ("lda must be >=
 * MAX(K,1)"); in row-major storage the leading dimension of A is its
 * column count, 3. */
A, 2,
/* NOTE(review): by the same rule ldb and ldc would presumably both be 1
 * (the column count of B and C); with 3 and 2 the call would step past
 * these allocations -- confirm against the cblas_sgemm reference. */
B, 3,
0.0,
C, 2);
printf ("[ %f, %f]\n", C[0], C[1]);
return 0;
}
According to the docs every argument seems to match yet I get this error:
lda must be >= MAX(K,1): lda=2 K=3BLAS error: Parameter number 9 passed to cblas_sgemm had an invalid value
The error you are seeing seems perfectly correct to my eyes.
LDA is always the pitch of the array A in linear memory. If you are using row major storage order, the pitch will be the number of columns, not the number of rows. So LDA should be 3 in this case.

Code Golf: Automata

Locked. This question and its answers are locked because the question is off-topic but has historical significance. It is not currently accepting new answers or interactions.
I made the ultimate laugh generator using these rules. Can you implement it in your favorite language in a clever manner?
Rules:
On every iteration, the following transformations occur.
H -> AH
A -> HA
AA -> HA
HH -> AH
AAH -> HA
HAA -> AH
n = 0 | H
n = 1 | AH
n = 2 | HAAH
n = 3 | AHAH
n = 4 | HAAHHAAH
n = 5 | AHAHHA
n = 6 | HAAHHAAHHA
n = 7 | AHAHHAAHHA
n = 8 | HAAHHAAHHAAHHA
n = 9 | AHAHHAAHAHHA
n = ...
Lex/Flex
69 characters. In the text here, I changed tabs to 8 spaces so it would look right, but all those consecutive spaces should be tabs, and the tabs are important, so it comes out to 69 characters.
#include <stdio.h>
%%
HAA|HH|H printf("AH");
AAH|AA|A printf("HA");
For what it's worth, the generated lex.yy.c is 42736 characters, but I don't think that really counts. I can (and soon will) write a pure-C version that will be much shorter and do the same thing, but I feel that should probably be a separate entry.
EDIT:
Here's a more legit Lex/Flex entry (302 characters):
char*c,*t;
#define s(a) t=c?realloc(c,strlen(c)+3):calloc(3,1);if(t)c=t,strcat(c,#a);
%%
free(c);c=NULL;
HAA|HH|H s(AH)
AAH|AA|A s(HA)
%%
int main(void){c=calloc(2,1);if(!c)return 1;*c='H';for(int n=0;n<10;n++)printf("n = %d | %s\n",n,c),yy_scan_string(c),yylex();return 0;}int yywrap(){return 1;}
This does multiple iterations (unlike the last one, which only did one iteration, and had to be manually seeded each time, but produced the correct results) and has the advantage of being extremely horrific-looking code. I use a function macro, the stringizing operator, and two global variables. If you want an even messier version that doesn't even check for malloc() failure, it looks like this (282 characters):
char*c,*t;
#define s(a) t=c?realloc(c,strlen(c)+3):calloc(3,1);c=t;strcat(c,#a);
%%
free(c);c=NULL;
HAA|HH|H s(AH)
AAH|AA|A s(HA)
%%
int main(void){c=calloc(2,1);*c='H';for(int n=0;n<10;n++)printf("n = %d | %s\n",n,c),yy_scan_string(c),yylex();return 0;}int yywrap(){return 1;}
An even worse version could be concocted where c is an array on the stack, and we just give it a MAX_BUFFER_SIZE of some sort, but I feel that's taking this too far.
...Just kidding. 207 characters if we take the "99 characters will always be enough" mindset:
char c[99]="H";
%%
c[0]=0;
HAA|HH|H strcat(c, "AH");
AAH|AA|A strcat(c, "HA");
%%
int main(void){for(int n=0;n<10;n++)printf("n = %d | %s\n",n,c),yy_scan_string(c),yylex();return 0;}int yywrap(){return 1;}
My preference is for the one that works best (i.e. the first one that can iterate until memory runs out and checks its errors), but this is code golf.
To compile the first one, type:
flex golf.l
gcc -ll lex.yy.c
(If you have lex instead of flex, just change flex to lex. They should be compatible.)
To compile the others, type:
flex golf.l
gcc -std=c99 lex.yy.c
Or else GCC will whine about ‘for’ loop initial declaration used outside C99 mode and other crap.
Pure C answer coming up.
MATLAB (v7.8.0):
73 characters (not including formatting characters used to make it look readable)
This script ("haha.m") assumes you have already defined the variable n:
s = 'H';
for i = 1:n,
s = regexprep(s,'(H)(H|AA)?|(A)(AH)?','${[137-$1 $1]}');
end
...and here's the one-line version:
s='H';for i=1:n,s = regexprep(s,'(H)(H|AA)?|(A)(AH)?','${[137-$1 $1]}');end
Test:
>> for n=0:10, haha; disp([num2str(n) ': ' s]); end
0: H
1: AH
2: HAAH
3: AHAH
4: HAAHHAAH
5: AHAHHA
6: HAAHHAAHHA
7: AHAHHAAHHA
8: HAAHHAAHHAAHHA
9: AHAHHAAHAHHA
10: HAAHHAAHHAHAAHHA
A simple translation to Haskell:
-- grammar seed: the infinite list of generations seed, step seed,
-- step (step seed), ...
-- NOTE(review): the indentation of the where-block appears to have been lost
-- in this copy; the clauses below belong under `where`.
grammar = iterate step
where
-- step rewrites one generation left to right. Three-character rules are
-- listed before two-character ones, and those before single characters, so
-- the longest rule that fits is the one applied.
step ('H':'A':'A':xs) = 'A':'H':step xs
step ('A':'A':'H':xs) = 'H':'A':step xs
step ('A':'A':xs) = 'H':'A':step xs
step ('H':'H':xs) = 'A':'H':step xs
step ('H':xs) = 'A':'H':step xs
step ('A':xs) = 'H':'A':step xs
-- End of input.
step [] = []
And a shorter version (122 chars, optimized down to three derivation rules + base case):
grammar=iterate s where{i 'H'='A';i 'A'='H';s(n:'A':m:x)|n/=m=m:n:s x;s(n:m:x)|n==m=(i n):n:s x;s(n:x)=(i n):n:s x;s[]=[]}
And a translation to C++ (182 chars, only does one iteration, invoke with initial state on the command line):
#include<cstdio>
#define o putchar
int main(int,char**v){char*p=v[1];while(*p){p[1]==65&&~*p&p[2]?o(p[2]),o(*p),p+=3:*p==p[1]?o(137-*p++),o(*p++),p:(o(137-*p),o(*p++),p);}return 0;}
Javascript:
120 stripping whitespace and I'm leaving it alone now!
function f(n,s){s='H';while(n--){s=s.replace(/HAA|AAH|HH?|AA?/g,function(a){return a.match(/^H/)?'AH':'HA'});};return s}
Expanded:
/**
 * Returns the n-th generation of the laugh string, starting from 'H'.
 *
 * @param {number} n  number of rewrite iterations to apply
 * @param {*} s       ignored on entry; used only as the working string
 * @returns {string}  the generation after n iterations
 */
function f(n, s)
{
    // Every rule starting with H rewrites to 'AH'; every rule starting
    // with A rewrites to 'HA'.
    var rewrite = function (matched) {
        if (matched.charAt(0) === 'H') {
            return 'AH';
        }
        return 'HA';
    };

    for (s = 'H'; n--; ) {
        s = s.replace(/HAA|AAH|HH?|AA?/g, rewrite);
    }
    return s;
}
that replacer is expensive!
Here's a C# example, coming in at 321 bytes if I reduce whitespace to one space between each item.
Edit: In response to @Johannes Rössel's comment, I removed generics from the solution to eke out a few more bytes.
Edit: Another change, got rid of all temporary variables.
/// <summary>
/// Applies one iteration of the rewrite rules to <paramref name="i"/> and
/// returns the next generation.
/// </summary>
/// <param name="i">Current generation, made up of 'H' and 'A'.</param>
/// <returns>The string with every matched rule replaced.</returns>
public static String E(String i)
{
    // Build the rule table once per call rather than once per regex match.
    var rules = new Hashtable {
        { "H", "AH" },
        { "A", "HA" },
        { "AA", "HA" },
        { "HH", "AH" },
        { "AAH", "HA" },
        { "HAA", "AH" }
    };
    // Alternatives are ordered longest first so the three-character rules
    // are tried before the shorter ones.
    return new Regex("HAA|AAH|HH|AA|A|H").Replace(i, m => (String)rules[m.Value]);
}
The rewritten solution with less whitespace, that still compiles, is 158 characters:
return new Regex("HAA|AAH|HH|AA|A|H").Replace(i,m =>(String)new Hashtable{{"H","AH"},{"A","HA"},{"AA","HA"},{"HH","AH"},{"AAH","HA"},{"HAA","AH"}}[m.Value]);
For a complete source code solution for Visual Studio 2008, a subversion repository with the necessary code, including unit tests, is available below.
Repository is here, username and password are both 'guest', without the quotes.
Ruby
This code golf is not very well specified -- I assumed that function returning n-th iteration string is best way to solve it. It has 80 characters.
def f n
a='h'
n.times{a.gsub!(/(h(h|aa)?)|(a(ah?)?)/){$1.nil?? "ha":"ah"}}
a
end
Code printing out n first strings (71 characters):
a='h';n.times{puts a.gsub!(/(h(h|aa)?)|(a(ah?)?)/){$1.nil?? "ha":"ah"}}
Erlang
241 bytes and ready to run:
> erl -noshell -s g i -s init stop
AHAHHAAHAHHA
-module(g).
-export([i/0]).
c("HAA"++T)->"AH"++c(T);
c("AAH"++T)->"HA"++c(T);
c("HH"++T)->"AH"++c(T);
c("AA"++T)->"HA"++c(T);
c("A"++T)->"HA"++c(T);
c("H"++T)->"AH"++c(T);
c([])->[].
i(0,L)->L;
i(N,L)->i(N-1,c(L)).
i()->io:format(i(9,"H"))
Could probably be improved.
Perl 168 characters.
(not counting unnecessary newlines)
perl -E'
($s,%m)=qw[H H AH A HA AA HA HH AH AAH HA HAA AH];
sub p{say qq[n = $_[0] | $_[1]]};p(0,$s);
for(1..9){$s=~s/(H(AA|H)?|A(AH?)?)/$m{$1}/g;p($_,$s)}
say q[n = ...]'
De-obfuscated:
use strict;
use warnings;
use 5.010;
# Current generation; starts at the seed.
my $str = 'H';

# Rewrite table: matched rule => replacement.
my %map = (
  H => 'AH',
  A => 'HA',
  AA => 'HA',
  HH => 'AH',
  AAH => 'HA',
  HAA => 'AH'
);

# Print one "n = <index> | <string>" line.
sub prn{
  my( $n, $str ) = @_;
  say "n = $n | $str"
}

prn( 0, $str );
for my $i ( 1..9 ){
  # Replace every rule occurrence, left to right, by its table entry.
  # /x allows the layout and comments, /g repeats, /e evaluates the
  # replacement as code.
  $str =~ s(
    (
      H(?:AA|H)? # HAA | HH | H
      |
      A(?:AH?)? # AAH | AA | A
    )
  ){
    $map{$1}
  }xge;
  prn( $i, $str );
}
say 'n = ...';
Perl 150 characters.
(not counting unnecessary newlines)
perl -E'
$s="H";
sub p{say qq[n = $_[0] | $_[1]]};p(0,$s);
for(1..9){$s=~s/(?|(H)(?:AA|H)?|(A)(?:AH?)?)/("H"eq$1?"A":"H").$1/eg;p($_,$s)}
say q[n = ...]'
De-obfuscated
#! /usr/bin/env perl
use strict;
use warnings;
use 5.010;
# Current generation; starts at the seed.
my $str = 'H';

# Print one "n = <index> | <string>" line.
sub prn{
  my( $n, $str ) = @_;
  say "n = $n | $str"
}

prn( 0, $str );
for my $i ( 1..9 ){
  # (?| ... ) is a branch reset, so $1 is the first letter of whichever
  # alternative matched. The replacement is the opposite letter followed
  # by that first letter: H... becomes "AH", A... becomes "HA".
  $str =~ s{(?|
    (H)(?:AA|H)? # HAA | HH | H
    |
    (A)(?:AH?)? # AAH | AA | A
  )}{
    ( 'H' eq $1 ?'A' :'H' ).$1
  }egx;
  prn( $i, $str );
}
say 'n = ...';
Python (150 bytes)
import re
N = 10
s = "H"
for n in range(N):
print "n = %d |"% n, s
s = re.sub("(HAA|HH|H)|AAH|AA|A", lambda m: m.group(1) and "AH" or "HA",s)
Output
n = 0 | H
n = 1 | AH
n = 2 | HAAH
n = 3 | AHAH
n = 4 | HAAHHAAH
n = 5 | AHAHHA
n = 6 | HAAHHAAHHA
n = 7 | AHAHHAAHHA
n = 8 | HAAHHAAHHAAHHA
n = 9 | AHAHHAAHAHHA
Here is a very simple C++ version:
#include <iostream>
#include <sstream>
using namespace std;
#define LINES 10
#define put(t) s << t; cout << t
#define r1(o,a,c0) \
if(c[0]==c0) {put(o); s.unget(); s.unget(); a; continue;}
#define r2(o,a,c0,c1) \
if(c[0]==c0 && c[1]==c1) {put(o); s.unget(); a; continue;}
#define r3(o,a,c0,c1,c2) \
if(c[0]==c0 && c[1]==c1 && c[2]==c2) {put(o); a; continue;}
// Generates successive lines by using the stringstream `s` as both the log
// of everything emitted and the input still to be rewritten: put() appends
// its text to `s` and echoes it to cout, and the loop reads `s` back three
// characters at a time.
int main() {
char c[3];
stringstream s;
// Seed: the first line followed by two newlines.
put("H\n\n");
// `i` is advanced only by the '\n' rule at the bottom of the loop body.
for(int i=2;i<LINES*2;) {
s.read(c,3);
// Rules are tried from longest to shortest. A matching rule emits its
// replacement and `continue`s. r2 ungets one character and r1 ungets two,
// returning the characters that were read but are not part of the match.
r3("AH",,'H','A','A');
r3("HA",,'A','A','H');
r2("AH",,'H','H');
r2("HA",,'A','A');
r1("HA",,'A');
r1("AH",,'H');
// A newline is copied through and counted.
r1("\n",i++,'\n');
}
}
It's not exactly code-golf (it could be made a lot shorter), but it works. Change LINES to however many lines you want printed (note: it will not work for 0). It will print output like this:
H
AH
HAAH
AHAH
HAAHHAAH
AHAHHA
HAAHHAAHHA
AHAHHAAHHA
HAAHHAAHHAAHHA
AHAHHAAHAHHA
ANSI C99
Coming in at a brutal 306 characters:
#include <stdio.h>
#include <string.h>
char s[99]="H",t[99]={0};int main(){for(int n=0;n<10;n++){int i=0,j=strlen(s);printf("n = %u | %s\n",n,s);strcpy(t,s);s[0]=0;for(;i<j;){if(t[i++]=='H'){t[i]=='H'?i++:t[i+1]=='A'?i+=2:1;strcat(s,"AH");}else{t[i]=='A'?i+=1+(t[i+1]=='H'):1;strcat(s,"HA");}}}return 0;}
There are too many nested ifs and conditional operators for me to effectively reduce this with macros. Believe me, I tried. Readable version:
#include <stdio.h>
#include <string.h>
char s[99] = "H", t[99] = {0};
/*
 * Prints generations n = 0..9 of the laugh string.
 *
 * The global buffer s holds the current generation. Each iteration prints
 * it, copies it to the scratch buffer t, clears s, then scans t left to
 * right appending the replacement for each matched rule to s.
 */
int main()
{
    for(int n = 0; n < 10; n++)
    {
        int i = 0, j = strlen(s);
        /* n is an int, so it is printed with %d. */
        printf("n = %d | %s\n", n, s);
        strcpy(t, s);
        s[0] = 0;
        /*
         * This was originally just a while() loop.
         * I tried to make it shorter by making it a for() loop.
         * I failed.
         * I kept the for() loop because it looked uglier than a while() loop.
         * This is code golf.
         */
        for(;i<j;)
        {
            if(t[i++] == 'H' )
            {
                /* Rules starting with H: "HH", "HAA" or a lone "H". */
                if(t[i] == 'H')
                    i++;
                /* Confirm t[i] is 'A' before looking at t[i+1]; otherwise
                 * t[i] is the terminator and t[i+1] is stale data from an
                 * earlier, longer generation. */
                else if(t[i] == 'A' && t[i+1] == 'A')
                    i+= 2;
                strcat(s, "AH");
            }
            else
            {
                /* Rules starting with A: "AAH", "AA" or a lone "A". */
                if(t[i] == 'A')
                    if(t[++i] == 'H')
                        i++;
                strcat(s, "HA");
            }
        }
    }
    return 0;
}
I may be able to make a shorter version with strncmp() in the future, but who knows? We'll see what happens.
In python:
def l(s):
H=['HAA','HH','H','AAH','AA','A']
L=['AH']*3+['HA']*3
for i in [3,2,1]:
if s[:i] in H: return L[H.index(s[:i])]+l(s[i:])
return s
def a(n,s='H'):
return s*(n<1)or a(n-1,l(s))
for i in xrange(0,10):
print '%d: %s'%(i,a(i))
First attempt: 198 char of code, I'm sure it can get smaller :D
REBOL, 150 characters. Unfortunately REBOL is not a language conducive to code golf, but 150 characters ain't too shabby, as Adam Sandler says.
This assumes the loop variable m has already been defined.
s: "H" r: "" z:[some[["HAA"|"HH"|"H"](append r "AH")|["AAH"|"AA"|"A"](append r "HA")]to end]repeat n m[clear r parse s z print["n =" n "|" s: copy r]]
And here it is with better layout:
; s holds the current generation, r collects the next one.
s: "H"
r: ""
; Parse rule: consume the input as a sequence of rule matches, appending the
; replacement for each match to r. Longer alternatives are listed first.
z: [
some [
[ "HAA" | "HH" | "H" ] (append r "AH")
| [ "AAH" | "AA" | "A" ] (append r "HA")
]
to end
]
; For each n up to m (m must already be defined): empty r, rewrite s into r,
; then print the line and make a copy of r the new s.
repeat n m [
clear r
parse s z
print ["n =" n "|" s: copy r]
]
F#: 184 chars
Seems to map pretty cleanly to F#:
// The two symbols of the laugh string.
type grammar = H | A
// laugh (n, l): applies the rewrite pass `loop` to the list l, n times.
// NOTE(review): the indentation of the nested definitions appears to have
// been lost in this copy.
let rec laugh = function
| 0,l -> l
| n,l ->
// loop performs one left-to-right rewrite pass. Within each case the
// longest pattern is listed first.
let rec loop = function
|H::A::A::x|H::H::x|H::x->A::H::loop x
|A::A::H::x|A::A::x|A::x->H::A::loop x
// Anything else (in practice the empty list) is returned unchanged.
|x->x
laugh(n-1,loop l)
Here's a run in fsi:
> [for a in 0 .. 9 -> a, laugh(a, [H])] |> Seq.iter (fun (a, b) -> printfn "n = %i: %A" a b);;
n = 0: [H]
n = 1: [A; H]
n = 2: [H; A; A; H]
n = 3: [A; H; A; H]
n = 4: [H; A; A; H; H; A; A; H]
n = 5: [A; H; A; H; H; A]
n = 6: [H; A; A; H; H; A; A; H; H; A]
n = 7: [A; H; A; H; H; A; A; H; H; A]
n = 8: [H; A; A; H; H; A; A; H; H; A; A; H; H; A]
n = 9: [A; H; A; H; H; A; A; H; A; H; H; A]