/*
	Portable TI99 tape encoder/decoder

	tape_dec.c: Code to decode contents of an existing tape image

	Raphael Nabet, 2002
*/

#include <float.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <string.h>

#include "common.h"
#include "my_bool.h"
#include "my_int.h"

#include "tifiles.h"
#include "wave.h"
#include "tape_dec.h"

/* Contents of a tape header (or record header), as filled in by read_tape_header() */
struct ti_header_t
{
	int block_len;	/* length of the data that follows the header, in 64-byte blocks */
};
typedef struct ti_header_t ti_header_t;

/* One 64-byte data block as read from tape, along with its checksums */
struct ti_block_t
{
	/* bit slip status, set by read_tape_block() while it reads the block leader */
	enum { no_bit_slip, possible_bit_slip, probable_bit_slip } bit_slip;
	unsigned char data[64];				/* block data */
	unsigned char data_checksum;		/* sum (modulo 256) of the bytes in data[] */
	unsigned char expected_checksum;	/* checksum byte read from tape after the data */
};
typedef struct ti_block_t ti_block_t;

/* Minimal number of consecutive 0 bits a leader must contain to be accepted */
enum
{
	min_block_leader_len = 32,	/* leader in front of each data block */
	min_header_leader_len = 56	/* leader in front of the tape (or record) header */
};

/* max_bits value passed to read_tape_bit() when the number of bits left is not limited */
#define MAX_BITS_MANY (INT_MAX)

/* Forward declarations of the decoding helpers defined further down in this file. */
/* read_tape_bit() has one definition per DECODING_ALGORITHM value. */
static error_t read_tape_header(wave_file_in_t *src, ti_header_t *ti_header);
static error_t read_tape_body(wave_file_in_t *src, tifile_out_t *dst, int len_in_blocks);
static error_t read_tape_block(wave_file_in_t *src, bool last_block, ti_block_t *buf);
static error_t read_tape_byte(wave_file_in_t *src, bool last_byte, int *reply);
static error_t read_tape_bit(wave_file_in_t *src, int max_bits, int *reply);

/* Shared decoder state: updated by read_tape_bit(), reset to 0 by read_tape_header() */
static int cur_state;	/* 1 if tape output is assumed to be currently >= 0 */


/*
Theory of decoding (DECODING_ALGORITHM 0):

	Bit reading

	Here is a 0 (assuming cassette line is initially low):
	     _____________________
	    |                     |
	  __|                     |__________
	    ^
	flux change
	Here is a 1 (assuming cassette line is initially low):
	     __________            __________
	    |          |          |
	  __|          |__________|
	    ^
	flux change

	To detect them, we detect the first flux change, then convolve with the following:
	        ____       ____
	    ___|    |_____|    |_	(B0)

	    ^w1| w2 | w3  | w4 
	    |
	tape's flux change
	        ____
	    ___|    |_____      _	(B1)
	                  |____|
	    ^w1| w2 | w3  | w4 |
	    |
	tape's flux change

	If the product with B0 is greater than the product with B1, the bit is 0,
	otherwise it is 1.

	Initial flux change detection is probably the weakest spot in my routine.
	I have been told that convolution can be used to detect the flux change,
	but none of the various attempts I made a few years ago worked.
	Looking back at this failure, I think we should consider several
	consecutive bits in order to get valid results.  (cf DECODING_ALGORITHM 2
	which uses pure convolution to detect flux changes.)

	Additionally, the convolution functions B0 and B1 are questionable,
	because they arbitrarily discard samples around the flux change.  While this
	enables the algorithm to work in spite of the uncertainty on the exact
	location of the various flux changes, it trashes valuable samples which
	should probably be taken into account for an algorithm to be optimal.


Theory of decoding (DECODING_ALGORITHM 2):

	The real idea is finding a square function f_(T_i)(t) whose signed product
	with the signal is maximal with respect to the T_i:

	   _____________________            _____________________
	  |                     |          |
	__|                     |__________|
	 T_0                   T_1        T_2

	With the following constraint on the T_i:
	T_(i-1)+2*DRATE-e<T_i<T_(i-1)+2*DRATE+e (data is 0)
	OR:
	T_(i-1)+DRATE-e<T_i<T_(i-1)+DRATE+e AND T_(i)+DRATE-e<T_(i+1)<T_i+DRATE+e (data is 1)

	Of course, the problem has too many variables (once or twice the number of
	bits recorded to tape) to be solved completely.  Analytical approaches would
	probably not help much (it would need a lot of work to get analytical
	functions that work, and even so we would need to have a good approximation
	of the final result beforehand for analytical optimization to work).
	Instead, we use a local approach, as follows:

	* We assume T_0 = 0.
	* We pose k=0, and repeat the following sequence (until we either reach
	  the end of tape data, or the end of the source wave file):
		* We maximize the product on a fixed-width window (width w_tot)
		  starting at T_k.  We retain the first bit of data found by the
		  maximization process.  If this bit is a 1, we keep the values for T_k
		  and T_k+1; if it is a 0, we keep the value for T_k only.
		* If the bit found is a 0, we repeat the process for k=k+1;
		  If it is a 1, we repeat the process for k=k+2.


Theory of decoding (DECODING_ALGORITHM 1):

	This algorithm is a weaker variant of DECODING_ALGORITHM 2.

	The function here is of the type:
		 _______________________________
		|                               |________		(0)
		            w1      w2  w3      T       w4
		 _______________                 ________
		|               |_______________|				(1)
		            w1  T'  w2  w3      T       w4

	With the following constraints:
		w3<T<w4
		w1<T'<w2


	Known problem: we may find a shape like this for a 1:
	     ___________________     _______________...
	 ...|                   |___|
		            w1      w2  w3              w4
	Obviously, this is no good: the shape is too distorted.


Ideas for improvements:

	Maybe we should apply a band-pass filter to reduce noise. On the one hand,
	it would be nice to filter out low-frequency noise (50Hz or 60Hz noise from
	the electric lines comes to mind) and high-frequency noise.  On the other
	hand, maybe we should estimate how square the leader is: if the signal is
	deformed, we could try to compute a filter which improves the signal.

	Another unaddressed issue is wow robustness.  Bit to bit wow cannot exceed
	1/3 theoretically (and I have found that a value of 20% already degrades
	white noise robustness with DECODING_ALGORITHM 2), because, otherwise,
	we might confuse 2 consecutive 1s with a 0.  However, the program could
	address higher wow values provided the frequency drift of the carrier is
	progressive, i.e. the carrier frequency for one bit remains within 20% of
	the average carrier frequency of the N previous bits.
*/

/* default data rate, in bits per second (the sample rate is divided by it to get
the bit width in samples).  The algorithm cannot detect data rate changes yet. */
static const double default_data_rate = 1379.;
/*static double data_rate;*/

/* Selects which compute_parameters()/read_tape_bit() pair is compiled: */
#define DECODING_ALGORITHM 2	/* 0: detection of first flux change and convolution with fixed-size window (fastest) */
								/* 1: pure convolution, single-level */
								/* 2: pure recursive (multi-level) convolution (should have sound mathematical foundations, but slow, and problems with last bits in stream) */

#if (DECODING_ALGORITHM == 0)

/* Parameters of DECODING_ALGORITHM 0, all expressed in samples */
static int bit_timeout;	/* how many samples find_next_flux_change() may wait for a flux change */
static int w1;			/* samples skipped after the flux change */
static int w2;			/* width of the first summed window */
static int w3;			/* samples skipped between the two windows */
static int w4;			/* width of the second summed window */

/*
	Derive the window widths from the sample rate of the source wave file

	sample_rate: number of samples per second
*/
static void compute_parameters(double sample_rate)
{
	double samples_per_bit = sample_rate / default_data_rate;
	double quarter_bit = samples_per_bit/4;

	/* skipped areas are rounded to nearest, summed windows are rounded up */
	w1 = /*floor(bit_width/8)*/samples_per_bit/8 +.5;
	w2 = ceil(quarter_bit);
	w3 = /*floor(bit_width/4)*/quarter_bit +.5;
	w4 = w2;

	bit_timeout = samples_per_bit/2+.5;
}

#elif (DECODING_ALGORITHM == 1)

/* Parameters of DECODING_ALGORITHM 1: sample offsets from the start of the bit */
static int w1;	/* lower bound for the mid-bit flux change of a 1 */
static int w2;	/* upper bound for the mid-bit flux change of a 1 */
static int w3;	/* lower bound for the flux change that ends the bit */
static int w4;	/* upper bound for the flux change that ends the bit */

/*
	Derive the search bounds from the sample rate of the source wave file

	sample_rate: number of samples per second
*/
static void compute_parameters(double sample_rate)
{
	double samples_per_bit = sample_rate / default_data_rate;

	/* each bound is a fixed fraction of the bit width, rounded to nearest */
	w4 = samples_per_bit*1.25 +.5;
	w3 = samples_per_bit*.75 +.5;
	w2 = samples_per_bit*.625 +.5;
	w1 = samples_per_bit*.375 +.5;
}

#else

/* this algorithm is similar (and yet different) to DECODING_ALGORITHM 1 when
max_wow = .25 and w_tot = 2*w_max */

/* Maximal value for wow.  It should be no greater than 1/3, or we might
confuse 2 consecutive 0s with a 1, and vice-versa.  Values greater than .2
degrade robustness greatly. */
static const double max_wow = /*.25*/.15;

static int w_min;	/* shortest accepted half-bit width, in samples */
static int w_max;	/* longest accepted half-bit width, in samples */
static int w_tot;	/* width of the optimization window, in samples */

/*
	Derive the window widths from the sample rate of the source wave file

	sample_rate: number of samples per second
*/
static void compute_parameters(double sample_rate)
{
	double samples_per_bit = sample_rate / default_data_rate;
	double half_bit = samples_per_bit*.5;

	/* The results of the algorithm improve greatly when using a wider window,
	but at a huge cost in speed (minimal value would be 2*w_max) */
	w_tot = samples_per_bit*2. +.5;

	/* nominal half-bit width, shrunk/stretched by the tolerated wow */
	w_max = half_bit*(1+max_wow) +.5;
	w_min = half_bit*(1-max_wow) +.5;
}

#endif

/*
	Read a complete tape

	src: wave file to decode
	dst: TI file that receives the decoded data.  If dst->type is ftype_data,
		the tape is read as a sequence of records, each one with its own
		header; dst->reclen must then be in the 0..192 range, 0 meaning that
		the record length is guessed from the first header (dst->reclen is
		updated accordingly).  For any other type, a single header and a
		single body are read.

	Returns no_error on success, or the error that stopped the decoding.
*/
error_t read_tape(wave_file_in_t *src, tifile_out_t *dst/*FILE *dst, ftype_t file_type, int record_len*/)
{
	error_t error;
	ti_header_t ti_header;
	int record_no;
	int record_len_in_blocks;


	compute_parameters(src->samplesPerSec);

	if (dst->type != ftype_data)
	{
		/* plain file: one header followed by one body */
		error = read_tape_header(src, & ti_header);
		if (error != no_error)
			return error;

		return read_tape_body(src, dst, ti_header.block_len);
	}

	/* multi-record file */
	if ((dst->reclen < 0) || (dst->reclen > 192))
		return invalid_parameters;

	record_no = 0;
	while (1)
	{
		#if DISPLAY_PROGRESS
			fprintf(stderr, "decoding record %d\n", record_no);
		#endif

		error = read_tape_header(src, & ti_header);
		if (error != no_error)
		{
			/* once at least one record has been read, running out of data
			means we must have reached EOF */
			if ((record_no != 0) && ((error == time_out) || (error == eof_error)))
				return no_error;

			/* could not find any data */
			return error;
		}

		if (record_no == 0)
		{
			if (dst->reclen == 0)
			{	/* guess record len: one block is 64 bytes */
				dst->reclen = ti_header.block_len*64;
				if (dst->reclen > 192)
					return invalid_multi_record_tape;	/* this cannot be a multi-record file */
			}

			/* how many 64-byte blocks per record? */
			record_len_in_blocks = (dst->reclen+63) / 64;
		}

		/* every record must have the length established with the first one */
		if (ti_header.block_len != record_len_in_blocks)
			return (record_no == 0)
					? invalid_record_len
					: invalid_multi_record_tape;	/* this, or we have reached EOF before the tape terminated */

		error = read_tape_body(src, dst, ti_header.block_len);
		if (error)
			return error;

		record_no++;
	}
}

/*
	Read the tape header (record header for multirecord tapes)

	The function looks for a leader of at least min_header_leader_len 0 bits,
	followed by eight 1 bits and by the data length (in 64-byte blocks), which
	is recorded twice.  Whenever a bit read times out or the data does not
	fit this pattern, the search starts over from the current tape position.

	src: wave file to decode
	ti_header: receives the block count (a length byte of 0 stands for 256)

	Returns no_error on success, or the first error other than time_out
	reported by the bit/byte readers.  The measured data rate is printed to
	stderr.
*/
static error_t read_tape_header(wave_file_in_t *src, ti_header_t *ti_header)
{
	error_t error;
	int bit, byte;
	int i;
	int leader_len;	/* length of leader in bits */
	/*int retry_count;*/
	sample_pos_t leader_sample_start;	/* always set before use, as an accepted leader is longer than 4 bits */
	sample_pos_t header_sample_len;	/* length of header in samples */

	cur_state = 0;
retry:
	/* run through leader until we find a 1 */
	leader_len = 0;
	do
	{
		error = read_tape_bit(src, MAX_BITS_MANY, & bit);
		if (error)
		{
			if (error == time_out)
				goto retry;
			else
				return error;
		}

		if (bit == 0)
		{
			leader_len++;

			/* Ignore first 4 bits, as synchronisation is likely to be not
			perfect with first samples.  The position is recorded right
			after the 4th leader bit has been read, so that it agrees with
			the (leader_len-4+24) bit count used to compute the rate. */
			if (leader_len == 4)
				leader_sample_start = tell_wave_sample(src);
		}
	}
	while (bit == 0);

	/* check that the leader had at least min_header_leader_len consecutive bits of 0s */
	if (leader_len < min_header_leader_len)
		goto retry;

	/* found one bit to 1, look for 7 more */
	for (i=0; i<7; i++)
	{
		error = read_tape_bit(src, MAX_BITS_MANY, & bit);
		if (error)
		{
			if (error == time_out)
				goto retry;
			else
				return error;
		}
		if (bit == 0)
			goto retry;
	}

	/* read len in blocks */
	error = read_tape_byte(src, false, & byte);
	if (error)
	{
		if (error == time_out)
			goto retry;
		else
			return error;
	}
	ti_header->block_len = byte;
	/* read second len */
	error = read_tape_byte(src, false, & byte);
	if (error)
	{
		if (error == time_out)
			goto retry;
		else
			return error;
	}
	if (ti_header->block_len != byte)
		goto retry;		/* the two copies of the length disagree: this is not a valid header */

	if (! ti_header->block_len)
		ti_header->block_len = 256;	/* TI99/4A tape write routine does support this */

	/* compute len */
	header_sample_len = tell_wave_sample(src) - leader_sample_start;


	{
		/* bits read since leader_sample_start: the leader minus its first 4
		bits, then 8 bits to 1 and two length bytes (24 bits) */
		double rate = (leader_len-4+24)/((double)header_sample_len / src->samplesPerSec);
		fprintf(stderr, "data rate %f\n", rate);
	}

	return no_error;
}

/*
	Read the tape data (record data for multirecord tapes)

	Every 64-byte block is read twice from tape (2*len_in_blocks calls to
	read_tape_block()).  Reading stops at the first block that cannot be
	read.  The two copies of each block are then compared, with the help of
	their checksums and of the bit slip status, to pick the copy that is
	written to dst.

	src: wave file to decode
	dst: TI file to write.  If dst->type is ftype_data, the (at most 3)
		blocks are gathered and written with a single tifile_write() call;
		otherwise each block is written separately.  Blocks that could not
		be read are replaced with zeros.
	len_in_blocks: number of 64-byte blocks announced by the header (at most
		256, which is the size the local buffers are dimensioned for)

	Returns no_error, or the error that interrupted the reading.
*/
static error_t read_tape_body(wave_file_in_t *src, tifile_out_t *dst, int len_in_blocks)
{
	int i;

	ti_block_t buf[2*256];
	error_t errors[2*256];
	int nb_block_read;
	error_t cumulative_error = no_error;
	unsigned char record_buf[192];


	/* read all data blocks */
	for (i=0; i<len_in_blocks*2; i++)
	{
		#if DISPLAY_PROGRESS
			if (! (i & 1))
				fprintf(stderr, "decoding block %d\n", i>>1);
		#endif
		errors[i] = read_tape_block(src, i == (len_in_blocks*2-1), & buf[i]);
		if (errors[i] != no_error)
		{
			/* We could ignore time out, but we would probably be out of sync and
			the decoding would yield faulty results */
			cumulative_error = errors[i];
			fprintf(stderr, "Could only read %d blocks out of %d: the tape is damaged and cannot be read until the end\n", (i+1)>>1, len_in_blocks);
			break;
		}
	}
	/* remember the number of blocks actually read */
	nb_block_read = i;

	/* try to match and fix each pair of data blocks we have read */
	for (i=0; i<(nb_block_read+1)>>1; i++)
	{
		int i0 = i*2, i1 = i*2+1;
		int i_good;
		int block_match;


		/* compute confidence, based on bit slip detection */
		/* bit slip is detected when reading the leader of next block */
		int bit_slip_0 = (i0+1<nb_block_read) ? buf[i0+1].bit_slip : possible_bit_slip;
		int bit_slip_1 = (i1+1<nb_block_read) ? buf[i1+1].bit_slip : possible_bit_slip;

		/* Confidence constants */
		enum
		{
			conf_init = 0,	/* default value for confidence */

			conf_no_bit_slip = 2,		/* offset if it appears there is no bit slip */
			conf_possible_bit_slip = 1,	/* offset if we cannot tell whether there is a bit slip */
			conf_probable_bit_slip = 0,	/* offset if it appears there is a bit slip */

			conf_checksum_no_bit_slip = 3,		/* offset if the data match a checksum with no apparent bit slip */
			conf_checksum_possible_bit_slip = 2,/* offset if the data match a checksum, but we cannot tell whether this checksum has been bit-slipped or not */
			conf_checksum_probable_bit_slip = 1,/* offset if the data match a checksum, but this checksum seems to have been bit-slipped */
			conf_no_checksum = 0,

			conf_none = -1			/* an error occurred when reading the block */
		};

		int conf_0, conf_1;

		/* confidence in the first copy */
		if (errors[i0])
			conf_0 = conf_none;
		else
		{
			conf_0 = conf_init;

			switch (bit_slip_0)
			{
			case no_bit_slip:
				conf_0 += conf_no_bit_slip;
				break;
			case possible_bit_slip:
				conf_0 += conf_possible_bit_slip;
				break;
			case probable_bit_slip:
				conf_0 += conf_probable_bit_slip;
				break;
			}

			/* the data may match the checksum of either copy; the most
			trustworthy matching checksum sets the bonus */
			if (((bit_slip_0 == no_bit_slip) && (buf[i0].data_checksum == buf[i0].expected_checksum))
					|| ((bit_slip_1 == no_bit_slip) && (buf[i0].data_checksum == buf[i1].expected_checksum)))
				conf_0 += conf_checksum_no_bit_slip;
			else if (((bit_slip_0 == possible_bit_slip) && (buf[i0].data_checksum == buf[i0].expected_checksum))
					|| ((bit_slip_1 == possible_bit_slip) && (buf[i0].data_checksum == buf[i1].expected_checksum)))
				conf_0 += conf_checksum_possible_bit_slip;
			else if (((bit_slip_0 == probable_bit_slip) && (buf[i0].data_checksum == buf[i0].expected_checksum))
					|| ((bit_slip_1 == probable_bit_slip) && (buf[i0].data_checksum == buf[i1].expected_checksum)))
				conf_0 += conf_checksum_probable_bit_slip;
			else
				conf_0 += conf_no_checksum;
		}

		/* confidence in the second copy */
		if (errors[i1])
			conf_1 = conf_none;
		else
		{
			conf_1 = conf_init;

			switch (bit_slip_1)
			{
			case no_bit_slip:
				conf_1 += conf_no_bit_slip;
				break;
			case possible_bit_slip:
				conf_1 += conf_possible_bit_slip;
				break;
			case probable_bit_slip:
				conf_1 += conf_probable_bit_slip;
				break;
			}

			if (((bit_slip_0 == no_bit_slip) && (buf[i1].data_checksum == buf[i0].expected_checksum))
					|| ((bit_slip_1 == no_bit_slip) && (buf[i1].data_checksum == buf[i1].expected_checksum)))
				conf_1 += conf_checksum_no_bit_slip;
			else if (((bit_slip_0 == possible_bit_slip) && (buf[i1].data_checksum == buf[i0].expected_checksum))
					|| ((bit_slip_1 == possible_bit_slip) && (buf[i1].data_checksum == buf[i1].expected_checksum)))
				conf_1 += conf_checksum_possible_bit_slip;
			else if (((bit_slip_0 == probable_bit_slip) && (buf[i1].data_checksum == buf[i0].expected_checksum))
					|| ((bit_slip_1 == probable_bit_slip) && (buf[i1].data_checksum == buf[i1].expected_checksum)))
				conf_1 += conf_checksum_probable_bit_slip;
			else
				conf_1 += conf_no_checksum;
		}


		fprintf(stderr, "saving block %d\n", i);

		if (errors[i0] && errors[i1])
		{	/* error in both blocks */
			fprintf(stderr, "I can't read block %d\n", i);

			i_good = 0;
		}
		else if (errors[i0] || errors[i1])
		{	/* error in either block */
			fprintf(stderr, "I can only read one copy of block %d\n", i);
			/* i_good is index of correct block */
			i_good = errors[i0] ? i1 : i0;
			if (buf[i_good].data_checksum != buf[i_good].expected_checksum)
				/* checksum of data does not match expected checksum */
				fprintf(stderr, "A probable read error was detected.\n");
		}
		else
		{	/* no error */
			block_match = ! memcmp(buf[i0].data, buf[i1].data, 64);
			if (block_match)
			{	/* data blocks are identical, therefore they are likely to be correct */
				i_good = i0;	/* does not make a difference whether it is i0 or i1*/

				if (buf[i0].expected_checksum == buf[i1].expected_checksum)
				{	/* checksums on tape are identical, as they should */
					if (buf[i0].data_checksum != buf[i0].expected_checksum)
						/* checksum of data does not match expected checksum */
						fprintf(stderr, "A possible read error was detected.\n");
				}
				else
				{	/* checksums on tape are different */
					if ((buf[i0].data_checksum == buf[i0].expected_checksum) || (buf[i0].data_checksum == buf[i1].expected_checksum))
					{
						/* data matches one of the checksums */
						fprintf(stderr, "A read error was detected and was corrected (hopefully).\n");
					}
					else
					{
						/* data matches neither checksum */
						fprintf(stderr, "A possible read error was detected.\n");
					}
				}
			}
			else
			{	/* data blocks are different, hopefully one of them is correct */
				if (buf[i0].data_checksum != buf[i1].data_checksum)
				{	/* Good, we can make the difference */
					if (buf[i0].expected_checksum == buf[i1].expected_checksum)
					{	/* checksums on tape are identical, as they should */
						if ((buf[i0].data_checksum == buf[i0].expected_checksum) || (buf[i1].data_checksum == buf[i0].expected_checksum))
						{	/* one of the data blocks matches the checksum */
							fprintf(stderr, "A read error was detected and was corrected (hopefully).\n");

							i_good = (conf_0 >= conf_1) ? i0 : i1;
						}
						else
						{	/* neither data block matches the checksum */

							/* try some error correction??? */

							fprintf(stderr, "A probable read error was detected.\n");

							i_good = (conf_0 >= conf_1) ? i0 : i1;
						}
					}
					else
					{	/* checksums on tape are different */
						if ((buf[i0].data_checksum == buf[i0].expected_checksum) && (buf[i1].data_checksum == buf[i1].expected_checksum))
						{	/* Both blocks taken separately seem to be correct.  Maybe we have slipped? */

							fprintf(stderr, "A probable read error was detected.\n");

							i_good = (conf_0 >= conf_1) ? i0 : i1;
						}
						else if ((buf[i0].data_checksum == buf[i1].expected_checksum) && (buf[i1].data_checksum == buf[i0].expected_checksum))
						{	/* Oh, no!  Each copy matches the checksum of the other one */

							/* try some error correction??? */

							fprintf(stderr, "A probable read error was detected.\n");

							i_good = (conf_0 >= conf_1) ? i0 : i1;
						}
						else if ((buf[i0].data_checksum == buf[i0].expected_checksum) || (buf[i0].data_checksum == buf[i1].expected_checksum))
						{	/* First copy looks correct */
							fprintf(stderr, "A read error was detected and was corrected (hopefully).\n");

							i_good = i0;
						}
						else if ((buf[i1].data_checksum == buf[i0].expected_checksum) || (buf[i1].data_checksum == buf[i1].expected_checksum))
						{	/* Second copy looks correct */
							fprintf(stderr, "A read error was detected and was corrected (hopefully).\n");

							i_good = i1;
						}
						else
						{	/* no match at all */

							/* try some error correction??? */

							fprintf(stderr, "A probable read error was detected.\n");

							i_good = (conf_0 >= conf_1) ? i0 : i1;
						}
					}
				}
				else
				{	/* both copies have the same data checksum: great, there is no way checksum can help us */

					/* error correction impossible??? */

					fprintf(stderr, "A probable read error was detected.\n");

					i_good = (conf_0 >= conf_1) ? i0 : i1;
				}
			}
		}

		if (dst->type == ftype_data)
		{
			if (i<3)	/* extra check, just be safe: record_buf holds 3 blocks */
				memcpy(record_buf+i*64, buf[i_good].data, 64);
		}
		else
			tifile_write(buf[i_good].data, dst);
	}

	/* replace the blocks we could not read with zeros */
	if ((nb_block_read+1)>>1 < len_in_blocks)
	{
		if (dst->type == ftype_data)
		{
			if ((nb_block_read+1)>>1 < 3)	/* extra check, just be safe */
				memset(record_buf+((nb_block_read+1)>>1)*64, 0,
						(len_in_blocks < 3) ? (len_in_blocks-((nb_block_read+1)>>1))*64
												: (3-((nb_block_read+1)>>1))*64);
		}
		else
		{
			memset(record_buf, 0, 64);
			for (i=(nb_block_read+1)>>1; i<len_in_blocks; i++)
				tifile_write(record_buf, dst);
		}
	}

	if (dst->type == ftype_data)
		tifile_write(record_buf, dst);

	return cumulative_error;
}

/*
	Read one block of tape

	A block is made of a leader of at least min_block_leader_len 0 bits,
	eight 1 bits, 64 data bytes and one checksum byte.  Whenever a read times
	out or the leader/sync pattern is not found, the search starts over from
	the current tape position.

	src: wave file to decode
	last_block: true if this is the last block of the tape body; it is
		forwarded to read_tape_byte() when reading the checksum byte
	buf: receives the data, both checksums and the bit slip status

	Returns no_error on success, or the first error other than time_out
	reported by the bit/byte readers.
*/
static error_t read_tape_block(wave_file_in_t *src, bool last_block, ti_block_t *buf)
{
	error_t error;
	int bit, byte;
	int n;
	int zeros_seen;	/* length of leader in bits */
	int bits_seen;	/* total number of bits we have read, retries included */

	buf->bit_slip = possible_bit_slip;
	bits_seen = 0;

retry:
	/* run through leader until we find a 1 */
	zeros_seen = 0;
	for (;;)
	{
		error = read_tape_bit(src, MAX_BITS_MANY, & bit);
		if (error)
		{
			if (error == time_out)
				goto retry;
			return error;
		}
		bits_seen++;
		if (bit != 0)
			break;
		zeros_seen++;
	}

	/* check that the leader had at least min_block_leader_len consecutive bits of 0s */
	if (zeros_seen < min_block_leader_len)
		goto retry;

	/* found one bit to 1, look for 7 more */
	for (n=0; n<7; n++)
	{
		error = read_tape_bit(src, MAX_BITS_MANY, & bit);
		if (error)
		{
			if (error == time_out)
				goto retry;
			return error;
		}
		bits_seen++;
		if (bit == 0)
			goto retry;
	}

	/* try to check that we did not slip when reading previous block: we
	expect to have read exactly 64 leader bits and 8 sync bits */
	if (bits_seen == 64+8)
		buf->bit_slip = no_bit_slip;
	else
		buf->bit_slip = probable_bit_slip;

	/* read block data, computing the checksum on the fly */
	buf->data_checksum = 0;
	/* todo: improve this */
	for (n=0; n<64; n++)
	{
		error = read_tape_byte(src, false, & byte);
		if (error)
		{
			if (error == time_out)
				goto retry;
			return error;
		}
		buf->data[n] = byte;
		buf->data_checksum = (buf->data_checksum + byte) & 0xff;
	}


	/* read checksum */
	error = read_tape_byte(src, last_block, & byte);
	if (error)
	{
		if (error == time_out)
			goto retry;
		return error;
	}
	buf->expected_checksum = byte;

	return no_error;
}

/*
	Read one byte in tape

	The byte is assembled from 8 bits, most significant bit first.

	src: wave file to decode
	last_byte: true if no data is expected after this byte; read_tape_bit()
		is then told how many bits are left (8 down to 1) instead of
		MAX_BITS_MANY
	reply: receives the byte (0 to 255); untouched if an error occurs

	Returns no_error, or the first error reported by read_tape_bit().
*/
static error_t read_tape_byte(wave_file_in_t *src, bool last_byte, int *reply)
{
	int bits_left;
	int value = 0;

	for (bits_left=8; bits_left>0; bits_left--)
	{
		int bit;
		error_t error = read_tape_bit(src, last_byte ? bits_left : MAX_BITS_MANY, & bit);

		if (error != no_error)
			return error;

		value = (value << 1) | bit;
	}

	*reply = value;

	return no_error;
}

#if (DECODING_ALGORITHM == 0)
/*
	Read one bit in tape (old variant, works fine)
*/

/*
	Find first flux change

	Samples are read until one is found whose sign does not agree with
	cur_state, or a read error occurs.  The search gives up with time_out
	once more than bit_timeout consecutive samples agree with cur_state.

	src: wave file to decode
	first_sample: receives the last sample read (not written on time_out)

	Returns no_error (cur_state has then been toggled), time_out, or the
	error returned by read_wave_sample().
*/
static error_t find_next_flux_change(wave_file_in_t *src, float *first_sample)
{
	error_t error;
	float sample;
	int samples_left = bit_timeout;


	for (;;)
	{
		error = read_wave_sample(src, & sample);
		if (error != no_error)
			break;

		/* stop on the first sample on the other side of 0 */
		if ((sample >= 0.) != cur_state)
			break;

		if (samples_left <= 0)
			return time_out;
		samples_left--;
	}
	*first_sample = sample;

	if (! error)
		cur_state = ! cur_state;

	return error;
}

/*
	Read one bit in tape (DECODING_ALGORITHM 0)

	After the next flux change has been found, w1 samples are skipped, w2
	samples are summed (sum2), w3 samples are skipped and w4 samples are
	summed (sum4).  The bit is a 1 if the two sums have opposite signs
	relative to the current tape state (i.e. there was a flux change in the
	middle of the bit), and a 0 otherwise.

	src: wave file to decode
	max_bits: unused by this algorithm; the parameter is only there so that
		the definition agrees with the prototype and callers shared by all
		the DECODING_ALGORITHM variants
	reply: receives the bit (0 or 1)

	Returns no_error, time_out if no flux change was found, or the error
	returned by read_wave_sample().
*/
static error_t read_tape_bit(wave_file_in_t *src, int max_bits, int *reply)
{
	error_t error;
	float buf;
	int i;
	float sum2, sum4, rating_0, rating_1;


	(void) max_bits;

	if ((error = find_next_flux_change(src, & buf)) != no_error)
		return error;

	/* skip the samples next to the flux change */
	for (i=0; i<w1; i++)
	{
		if ((error = read_wave_sample(src, & buf)) != no_error)
			return error;
	}

	/* the last sample read so far is the first sample of the w2 window */
	sum2 = buf;

	for (i=0; i<w2-1; i++)
	{
		if ((error = read_wave_sample(src, & buf)) != no_error)
			return error;
		sum2 += buf;
	}

	/* skip the samples around the possible mid-bit flux change */
	for (i=0; i<w3; i++)
	{
		if ((error = read_wave_sample(src, & buf)) != no_error)
			return error;
	}

	sum4 = 0;

	for (i=0; i<w4; i++)
	{
		if ((error = read_wave_sample(src, & buf)) != no_error)
			return error;
		sum4 += buf;
	}

	/* cur_state was toggled by find_next_flux_change(): it is now the state
	of the first half of the bit */
	if (cur_state)
	{
		rating_0 = sum2 + sum4;
		rating_1 = sum2 - sum4;
	}
	else
	{
		rating_0 = - (sum2 + sum4);
		rating_1 = - (sum2 - sum4);
	}

	if (rating_1 > rating_0)
	{
		/* a 1 has a second flux change in the middle of the bit */
		*reply = 1;
		cur_state = ! cur_state;
	}
	else
		*reply = 0;

	return no_error;
}

#elif (DECODING_ALGORITHM == 1)
/*
	Read one bit in tape (experimental variant, with single level optimization)

	The function keeps a static look-ahead buffer of BUF_LEN samples whose
	first sample is the start of the bit to decode.  Once the bit has been
	decoded, the buffer is shifted so that it starts at the flux change
	that ends the bit, and it is refilled from the wave file.

	src: wave file to decode
	max_bits: unused by this algorithm; the parameter is only there so that
		the definition agrees with the prototype and callers shared by all
		the DECODING_ALGORITHM variants
	reply: receives the bit (0 or 1)

	Returns no_error, eof_error when fewer than w4 samples are left before
	the end of the wave file, or any other error returned by
	read_wave_sample().
*/
#define BUF_LEN 1024

static error_t read_tape_bit(wave_file_in_t *src, int max_bits, int *reply)
{
	error_t error;
	int i, j;
	int i_max_0, i_max_1, i_max;
	float sum;
	float cur_rating_0, cur_rating_1, cur_rating_1_save;
	float max_rating_0, max_rating_1;

	/* buffer */
	static float buf[BUF_LEN];
	static uint_fast32_t cur_buf_pos = 0;	/* position in the wave file of buf[0] */
	static int cur_eof_pos = -1;			/* position of the end of file, -1 if not reached yet */


	(void) max_bits;

	if (cur_buf_pos == 0)
	{	/* fill buffer */
		for (i=0; i<BUF_LEN; i++)
		{
			error = read_wave_sample(src, & buf[i]);
			if (error)
			{
				if (error == eof_error)
				{
					if (cur_eof_pos == -1)
						cur_eof_pos = cur_buf_pos+i;
					/* pad with the last valid sample; there is none if the
					file holds no sample at all */
					buf[i] = (i > 0) ? buf[i-1] : 0;
				}
				else
					return error;
			}
		}
	}

	if ((cur_eof_pos != -1) && (cur_buf_pos+w4 > cur_eof_pos))
		return eof_error;


	/*
		Initially compute convolution with the 2 following functions:

		                 b
		    _____________
		                 |____		(B0)
		    ^  w1   w2   w3   w4
		    |
		tape's flux change
		       a         b
		    ___           _____
		       |_________|			(B1)
		    ^  w1   w2   w3   w4
		    |
		tape's flux change
	*/
	sum = 0.;
	for (i=0; i<w1; i++)
		sum += buf[i];

	cur_rating_0 = cur_rating_1 = sum;

	sum = 0.;
	for (i=w1; i<w3; i++)
		sum += buf[i];

	cur_rating_0 += sum;
	cur_rating_1 -= sum;

	sum = 0.;
	for (i=w3; i<w4; i++)
		sum += buf[i];

	cur_rating_0 -= sum;
	cur_rating_1 += sum;

	max_rating_0 = cur_rating_0;
	max_rating_1 = cur_rating_1_save = cur_rating_1;
	i_max_0 = i_max_1 = w3;

	/*
		Iteratively compute convolution for each a and b with ((w1<=a<w2) and (w3<=b<w4)).

		We keep track of the maximal (tape initially high) or minimal (tape initially low)
		values.
	*/
	if (cur_state)
	{
		for (i=w3; i<w4; i++)
		{
			/* move b one sample to the right */
			cur_rating_0 += buf[i]+buf[i];
			if (cur_rating_0 > max_rating_0)
			{
				max_rating_0 = cur_rating_0;
				i_max_0 = i+1;
			}
			cur_rating_1_save -= buf[i]+buf[i];
			cur_rating_1 = cur_rating_1_save;
			/* for this b, try each a */
			for (j=w1; j<w2; j++)
			{
				cur_rating_1 += buf[j]+buf[j];
				if (cur_rating_1 > max_rating_1)
				{
					max_rating_1 = cur_rating_1;
					i_max_1 = i+1;
				}
			}
		}

		if (max_rating_1 > max_rating_0)
		{
			*reply = 1;
			i_max = i_max_1;
		}
		else
		{
			*reply = 0;
			cur_state = ! cur_state;
			i_max = i_max_0;
		}
	}
	else
	{
		for (i=w3; i<w4; i++)
		{
			/* move b one sample to the right */
			cur_rating_0 += buf[i]+buf[i];
			if (cur_rating_0 < max_rating_0)
			{
				max_rating_0 = cur_rating_0;
				i_max_0 = i+1;
			}
			cur_rating_1_save -= buf[i]+buf[i];
			cur_rating_1 = cur_rating_1_save;
			/* for this b, try each a */
			for (j=w1; j<w2; j++)
			{
				cur_rating_1 += buf[j]+buf[j];
				if (cur_rating_1 < max_rating_1)
				{
					max_rating_1 = cur_rating_1;
					i_max_1 = i+1;
				}
			}
		}

		if (max_rating_1 < max_rating_0)
		{
			*reply = 1;
			i_max = i_max_1;
		}
		else
		{
			*reply = 0;
			cur_state = ! cur_state;
			i_max = i_max_0;
		}
	}

	/* update buffer */
	/* The value of b where the maximum is reached ("i_max") will be the starting point
	("flux change") to decode next bit.*/
	cur_buf_pos += i_max;

	for (i=0; i<BUF_LEN-i_max; i++)
		buf[i] = buf[i_max+i];

	for (i=BUF_LEN-i_max; i<BUF_LEN; i++)
	{
		error = read_wave_sample(src, & buf[i]);
		if (error)
		{
			if (error == eof_error)
			{
				if (cur_eof_pos == -1)
					cur_eof_pos = cur_buf_pos+i;
				/* pad with the last valid sample */
				buf[i] = (i > 0) ? buf[i-1] : 0;
			}
			else
				return error;
		}
	}

	return no_error;
}


#else
/*
	Read one bit in tape (experimental variant, with variable-width optimization window)

	The function keeps a static look-ahead buffer of BUF_LEN samples whose
	first sample is the start of the bit to decode.  Every possible sequence
	of "depth" bits is rated against the first w_tot samples of the buffer
	(fewer near the end of the file), the position of each flux change being
	allowed to vary between w_min and w_max samples per half-bit.  The first
	bit of the best-rated sequence is returned, and the buffer is shifted to
	the position found for the end of this bit.

	src: wave file to decode
	max_bits: maximum number of bits that may still be present on tape
		(MAX_BITS_MANY if unknown); no flux change is assumed past this
		number of bits.  NOTE(review): the code assumes max_bits >= 1, which
		holds for every caller in this file.
	reply: receives the bit (0 or 1)

	Returns no_error, eof_error when fewer than 2*w_max samples are left
	before the end of the wave file, or any other error returned by
	read_wave_sample().
*/
#define BUF_LEN 1024
#define MAX_DEPTH 10

static error_t read_tape_bit(wave_file_in_t *src, int max_bits, int *reply)
{
	error_t error;
	int depth;
	int bit_seq;
	int i, j, k;
	float sum;
	float cur_rating;
	float max_rating;
	int bit_end_max;
	int assumed_state;
	int state_transition[MAX_DEPTH*2];

	int i_save[MAX_DEPTH*2-1];
	int i0_save[MAX_DEPTH*2-1], i1_save[MAX_DEPTH*2-1], i2_save[MAX_DEPTH*2-1];
	int i0, i1, i2;
	float cur_rating_save[MAX_DEPTH*2];

	int i0_prep[MAX_DEPTH*2], i1_prep[MAX_DEPTH*2], i2_prep[MAX_DEPTH*2];
	int end_level;

	/* buffer */
	static float buf[BUF_LEN];
	static uint_fast32_t cur_buf_pos = 0;	/* position in the wave file of buf[0] */
	static int cur_eof_pos = -1;			/* position of the end of file, -1 if not reached yet */
	static int w_tot_fix;					/* w_tot, shortened when the end of file is near */


	if (cur_buf_pos == 0)
	{	/* fill buffer */
		for (i=0; i<BUF_LEN; i++)
		{
			error = read_wave_sample(src, & buf[i]);
			if (error)
			{
				if (error == eof_error)
				{
					if (cur_eof_pos == -1)
						cur_eof_pos = cur_buf_pos+i;
					/* pad with the last valid sample; there is none if the
					file holds no sample at all */
					buf[i] = (i > 0) ? buf[i-1] : 0;
				}
				else
					return error;
			}
		}
	}

	if ((cur_eof_pos != -1) && (cur_buf_pos+w_tot > cur_eof_pos))
	{
		/* not enough samples left for a full window: use a shorter one as
		long as it can still hold one bit */
		if (cur_eof_pos - cur_buf_pos >= 2*w_max)
			w_tot_fix = cur_eof_pos - cur_buf_pos;
		else
			return eof_error;
	}
	else
		w_tot_fix = w_tot;


	/* optimization depth */
	depth = ceil(((double) w_tot_fix / w_min) / 2);
	max_rating = - FLT_MAX;

	/* test each possible bit sequence */
	for (bit_seq = 0; bit_seq < (1 << depth); bit_seq++)
	{
		k = 0;
		assumed_state = cur_state;
		cur_rating = 0.;

		/* compute cur_rating, with each half-bit at its minimal width (w_min) */
		for (j=0; (j<depth) && (j<max_bits); j++)
		{
			assumed_state = ! assumed_state;

			sum = 0.;
			for (i=0; (i<w_min) && (k<w_tot_fix); i++, k++)
				sum += buf[k];

			if (assumed_state)
				cur_rating += sum;
			else
				cur_rating -= sum;

			if ((bit_seq >> j) & 1)
				assumed_state = ! assumed_state;

			sum = 0.;
			for (i=0; (i<w_min) && (k<w_tot_fix); i++, k++)
				sum += buf[k];

			if (assumed_state)
				cur_rating += sum;
			else
				cur_rating -= sum;
		}
		if (j<depth)
		{
			/* no more bits on tape: the state is assumed not to change
			until the end of the window */
			sum = 0.;
			while (k<w_tot_fix)
				sum += buf[k++];

			if (assumed_state)
				cur_rating += sum;
			else
				cur_rating -= sum;
		}

		if (cur_rating > max_rating)
		{
			max_rating = cur_rating;
			*reply = bit_seq & 1;
			bit_end_max = w_min+w_min;
		}

		/* prepare, for each flux change k of the sequence, its sign
		(state_transition) and the bounds used to move it (i0/i1/i2_prep) */
		end_level = 0;
		for (j=0, k=0, assumed_state = cur_state; j<depth; j++)
		{
			if (j<max_bits)
			{
				assumed_state = ! assumed_state;
				if (k)
					state_transition[k-1] = assumed_state ? 1 : -1;
			}
			else
				state_transition[k-1] = 0;
			if ((bit_seq >> j) & 1)
			{
				i0_prep[k] = w_min;
				i1_prep[k] = w_tot_fix-w_min;
				i2_prep[k] = w_max;
				k++;
				if (j<max_bits)
				{
					assumed_state = ! assumed_state;
					state_transition[k-1] = assumed_state ? 1 : -1;
				}
				else
					state_transition[k-1] = 0;
				i0_prep[k] = w_min;
				i1_prep[k] = w_tot_fix-(((bit_seq >> (j+1)) & 1) ? w_min : (w_min+w_min));
				i2_prep[k] = w_max;
				k++;
			}
			else
			{
				i0_prep[k] = w_min+w_min;
				i1_prep[k] = w_tot_fix-(((bit_seq >> (j+1)) & 1) ? w_min : (w_min+w_min));
				i2_prep[k] = w_max+w_max;
				k++;
			}

			/* has the truncated sequence (first j bits of bit_seq) already been tested? */
			if (bit_seq & (1 << (j+1)))
				end_level = k;
		}

		k = 0;
		/*i = 0;*/

		i0 = i0_prep[0];

		i2 = i2_prep[0];
		if (i2 > w_tot_fix)
			i2 = w_tot_fix;

		i1 = i1_prep[0];
		if (i1 > i2)
			i1 = i2;

		i = i0;

		cur_rating_save[/*k*/0] = cur_rating;

		/* iterative depth-first walk over the positions of the flux changes:
		k is the current level, i the position tried at this level */
		while (1)
		{
			if (i<i1)
			{
				/* go down one level: save this level and start the next one */
				i_save[k] = i;
				i0_save[k] = i0;
				i1_save[k] = i1;
				i2_save[k] = i2;

				k++;

				i0 = i+i0_prep[k];

				i2 = i+i2_prep[k];
				if (i2 > w_tot_fix)
					i2 = w_tot_fix;

				i1 = i1_prep[k];
				if (i1 > i2)
					i1 = i2;

				i = i0;

				cur_rating_save[k] = cur_rating;
			}
			else if (i<i2)
			{
				/* move the flux change of this level one sample to the right */
				if (state_transition[k] > 0)
					cur_rating -= buf[i]+buf[i];
				else if (state_transition[k] < 0)
					cur_rating += buf[i]+buf[i];

				if (i == i0)
					cur_rating_save[k] = cur_rating;

				i++;

				if (cur_rating > max_rating)
				{
					max_rating = cur_rating;
					*reply = bit_seq & 1;
					bit_end_max = (k == (bit_seq & 1)) ? i : i_save[bit_seq & 1];
				}
			}
			else
			{
				/* has this truncated bit_seq already been tested? */
				if ((i == w_tot_fix) &&(k<=end_level))
					break;

				/* go back up one level, and move its flux change one sample
				to the right */
				k--;
				if (k<0)
					break;

				i = i_save[k];
				i0 = i0_save[k];
				i1 = i1_save[k];
				i2 = i2_save[k];

				cur_rating = cur_rating_save[k+1];

				if (state_transition[k] > 0)
					cur_rating -= buf[i]+buf[i];
				else if (state_transition[k] < 0)
					cur_rating += buf[i]+buf[i];

				if (i == i0)
					cur_rating_save[k] = cur_rating;

				i++;

				if (cur_rating > max_rating)
				{
					max_rating = cur_rating;
					*reply = bit_seq & 1;
					bit_end_max = (k == (bit_seq & 1)) ? i : (k == 0) ? i+w_min : i_save[bit_seq & 1];
				}
			}
		}
	}

	/* update cur_state */
	if (*reply == 0)
		cur_state = ! cur_state;

	/* update buffer */
	/* The value of b where the maximum is reached ("bit_end_max") will be the starting point
	("flux change") to decode next bit.*/
	cur_buf_pos += bit_end_max;

	for (i=0; i<BUF_LEN-bit_end_max; i++)
		buf[i] = buf[bit_end_max+i];

	for (i=BUF_LEN-bit_end_max; i<BUF_LEN; i++)
	{
		error = read_wave_sample(src, & buf[i]);
		if (error)
		{
			if (error == eof_error)
			{
				if (cur_eof_pos == -1)
					cur_eof_pos = cur_buf_pos+i;
				/* pad with the last valid sample */
				buf[i] = (i > 0) ? buf[i-1] : 0;
			}
			else
				return error;
		}
	}

	return no_error;
}

#endif
