views:

329

answers:

1

Question: I am trying to use the espeak text-to-speech engine. So for I got it working wounderfully on linux (code below). Now I wanted to port this basic program to windows, too, but it's nearly impossible...

Part of the problem is that the windows dll only allows for AUDIO_OUTPUT_SYNCHRONOUS, which means it requires a callback, but I can't figure out how to play the audio from the callback... First it crashed, then I realized, I need a callback function, now I get the data in the callback function, but I don't know how to play it... as it is neither a wav file nor plays automatically as on Linux.

The sourceforge site is rather useless, because it basically says use the SAPI version, but then there is no example on how to use the sapi espeak dll...

Anyway, here's my code, can anybody help?

#ifdef __cplusplus
#include <cstdio>
#include <cstdlib>
#include <cstring>

else

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

endif

include

include

//#include "speak_lib.h"

include "espeak/speak_lib.h"

// libespeak-dev: /usr/include/espeak/speak_lib.h // apt-get install libespeak-dev // apt-get install libportaudio-dev

// g++ -o mine mine.cpp -lespeak // g++ -o mine mine.cpp -I/usr/include/espeak/ -lespeak // gcc -o mine mine.cpp -I/usr/include/espeak/ -lespeak

char voicename[40]; int samplerate; int quiet = 0; static char genders[4] = {' ','M','F',' '};

//const char *data_path = "/usr/share/"; // /usr/share/espeak-data/ const char *data_path = NULL; // use default path for espeak-data

int strrcmp(const char *s, const char *sub) { int slen = strlen(s); int sublen = strlen(sub); return memcmp(s + slen - sublen, sub, sublen); }

char * strrcpy(char *dest, const char *source) { // Pre assertions assert(dest != NULL); assert(source != NULL); assert(dest != source);

// tk: parentheses
while((*dest++ = *source++))
    ;
return(--dest);

}

const char* GetLanguageVoiceName(const char* pszShortSign) { #define LANGUAGE_LENGTH 30 static char szReturnValue[LANGUAGE_LENGTH] ; memset(szReturnValue, 0, LANGUAGE_LENGTH);

for (int i = 0; pszShortSign[i] != '\0'; ++i)
    szReturnValue[i] = (char) tolower(pszShortSign[i]);

const espeak_VOICE **voices;
espeak_VOICE voice_select;
voices = espeak_ListVoices(NULL);

const espeak_VOICE *v;
for(int ix=0; (v = voices[ix]) != NULL; ix++)
{
    if( !strrcmp( v->languages, szReturnValue) )
    {
        strcpy(szReturnValue, v->name);
        return szReturnValue;
    }
} // End for

strcpy(szReturnValue, "default");
return szReturnValue;

} // End function getvoicename

void ListVoices() { const espeak_VOICE **voices; espeak_VOICE voice_select; voices = espeak_ListVoices(NULL);

const espeak_VOICE *v;
for(int ix=0; (v = voices[ix]) != NULL; ix++)
{
    printf("Shortsign: %s\n", v->languages);
    printf("age: %d\n", v->age);
    printf("gender: %c\n", genders[v->gender]);
    printf("name: %s\n", v->name);
    printf("\n\n");
} // End for

} // End function getvoicename

int main() { printf("Hello World!\n"); const char* szVersionInfo = espeak_Info(NULL);

printf("Espeak version: %s\n", szVersionInfo);
samplerate = espeak_Initialize(AUDIO_OUTPUT_PLAYBACK,0,data_path,0);

strcpy(voicename, "default");
// espeak --voices
strcpy(voicename, "german");
strcpy(voicename, GetLanguageVoiceName("DE"));

if(espeak_SetVoiceByName(voicename) != EE_OK)
{
    printf("Espeak setvoice error...\n");
}

static char word[200] = "Hello World" ;
strcpy(word, "TV-fäns aufgepasst, es ist 20 Uhr 15. Zeit für Rambo 3");
strcpy(word, "Unnamed Player wurde zum Opfer von GSG9");
int speed = 220;
int volume = 500; // volume in range 0-100    0=silence
int pitch = 50; //  base pitch, range 0-100.  50=normal

// espeak.cpp 625
espeak_SetParameter(espeakRATE, speed, 0);
espeak_SetParameter(espeakVOLUME,volume,0);
espeak_SetParameter(espeakPITCH,pitch,0);
// espeakRANGE:   pitch range, range 0-100. 0-monotone, 50=normal
// espeakPUNCTUATION:  which punctuation characters to announce:
// value in espeak_PUNCT_TYPE (none, all, some), 
espeak_VOICE *voice_spec = espeak_GetCurrentVoice();
voice_spec->gender=2; // 0=none 1=male, 2=female,
//voice_spec->age = age;

espeak_SetVoiceByProperties(voice_spec);


espeak_Synth( (char*) word, strlen(word)+1, 0, POS_CHARACTER, 0, espeakCHARS_AUTO, NULL, NULL);
espeak_Synchronize();

strcpy(voicename, GetLanguageVoiceName("EN"));
espeak_SetVoiceByName(voicename);
strcpy(word, "Geany was fragged by GSG9 Googlebot");
strcpy(word, "Googlebot");

espeak_Synth( (char*) word, strlen(word)+1, 0, POS_CHARACTER, 0, espeakCHARS_AUTO, NULL, NULL);
espeak_Synchronize();


espeak_Terminate();
printf("Espeak terminated\n");
return EXIT_SUCCESS; 

}

/* if(espeak_SetVoiceByName(voicename) != EE_OK) { memset(&voice_select,0,sizeof(voice_select)); voice_select.languages = voicename; if(espeak_SetVoiceByProperties(&voice_select) != EE_OK) { fprintf(stderr,"%svoice '%s'\n",err_load,voicename); exit(2); } } */

The above code is for Linux. The below code is about as far as I got on Vista x64 (32 bit emu):

#ifdef __cplusplus
#include <cstdio>
#include <cstdlib>
#include <cstring>

else

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

endif

include

include

include "speak_lib.h"

//#include "espeak/speak_lib.h"

// libespeak-dev: /usr/include/espeak/speak_lib.h // apt-get install libespeak-dev // apt-get install libportaudio-dev

// g++ -o mine mine.cpp -lespeak // g++ -o mine mine.cpp -I/usr/include/espeak/ -lespeak // gcc -o mine mine.cpp -I/usr/include/espeak/ -lespeak

char voicename[40]; int iSampleRate; int quiet = 0; static char genders[4] = {' ','M','F',' '};

//const char *data_path = "/usr/share/"; // /usr/share/espeak-data/ //const char *data_path = NULL; // use default path for espeak-data const char *data_path = "C:\Users\Username\Desktop\espeak-1.43-source\espeak-1.43-source\";

int strrcmp(const char *s, const char *sub) { int slen = strlen(s); int sublen = strlen(sub); return memcmp(s + slen - sublen, sub, sublen); }

char * strrcpy(char *dest, const char *source) { // Pre assertions assert(dest != NULL); assert(source != NULL); assert(dest != source);

// tk: parentheses
while((*dest++ = *source++))
    ;
return(--dest);

}

const char* GetLanguageVoiceName(const char* pszShortSign) { #define LANGUAGE_LENGTH 30 static char szReturnValue[LANGUAGE_LENGTH] ; memset(szReturnValue, 0, LANGUAGE_LENGTH);

for (int i = 0; pszShortSign[i] != '\0'; ++i)
    szReturnValue[i] = (char) tolower(pszShortSign[i]);

const espeak_VOICE **voices;
espeak_VOICE voice_select;
voices = espeak_ListVoices(NULL);

const espeak_VOICE *v;
for(int ix=0; (v = voices[ix]) != NULL; ix++)
{
    if( !strrcmp( v->languages, szReturnValue) )
    {
        strcpy(szReturnValue, v->name);
        return szReturnValue;
    }
} // End for

strcpy(szReturnValue, "default");
return szReturnValue;

} // End function getvoicename

void ListVoices() { const espeak_VOICE **voices; espeak_VOICE voice_select; voices = espeak_ListVoices(NULL);

const espeak_VOICE *v;
for(int ix=0; (v = voices[ix]) != NULL; ix++)
{
    printf("Shortsign: %s\n", v->languages);
    printf("age: %d\n", v->age);
    printf("gender: %c\n", genders[v->gender]);
    printf("name: %s\n", v->name);
    printf("\n\n");
} // End for

} // End function getvoicename

/* Callback from espeak. Directly speaks using AudioTrack. */

define LOGI(x) printf("%s\n", x)

static int AndroidEspeakDirectSpeechCallback(short *wav, int numsamples, espeak_EVENT *events) { char buf[100]; sprintf(buf, "AndroidEspeakDirectSpeechCallback: %d samples", numsamples); LOGI(buf);

if (wav == NULL) 
{
    LOGI("Null: speech has completed");
}

  if (numsamples > 0)
  {
    //audout->write(wav, sizeof(short) * numsamples);
    sprintf(buf, "AudioTrack wrote: %d bytes", sizeof(short) * numsamples);
    LOGI(buf);
  }

return 0;  // continue synthesis (1 is to abort)

}

static int AndroidEspeakSynthToFileCallback(short *wav, int numsamples,espeak_EVENT *events) { char buf[100]; sprintf(buf, "AndroidEspeakSynthToFileCallback: %d samples", numsamples); LOGI(buf);

if (wav == NULL) 
{
    LOGI("Null: speech has completed");
}

// The user data should contain the file pointer of the file to write to
//void* user_data = events->user_data;
FILE* user_data = fopen ( "myfile1.wav" , "ab" );

FILE* fp = static_cast<FILE *>(user_data);

// Write all of the samples
fwrite(wav, sizeof(short), numsamples, fp);
return 0;  // continue synthesis (1 is to abort)

}

int main() { printf("Hello World!\n"); const char* szVersionInfo = espeak_Info(NULL);

printf("Espeak version: %s\n", szVersionInfo);

iSampleRate = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 4096, data_path, 0);
if (iSampleRate <= 0) 
{
    printf("Unable to initialize espeak");
    return EXIT_FAILURE;
}

//samplerate = espeak_Initialize(AUDIO_OUTPUT_PLAYBACK,0,data_path,0);

//ListVoices();

strcpy(voicename, "default");
// espeak --voices
//strcpy(voicename, "german");
//strcpy(voicename, GetLanguageVoiceName("DE"));

if(espeak_SetVoiceByName(voicename) != EE_OK)
{
    printf("Espeak setvoice error...\n");
}

static char word[200] = "Hello World" ;
strcpy(word, "TV-fäns aufgepasst, es ist 20 Uhr 15. Zeit für Rambo 3");
strcpy(word, "Unnamed Player wurde zum Opfer von GSG9");
int speed = 220;
int volume = 500; // volume in range 0-100    0=silence
int pitch = 50; //  base pitch, range 0-100.  50=normal


// espeak.cpp 625
espeak_SetParameter(espeakRATE, speed, 0);
espeak_SetParameter(espeakVOLUME,volume,0);
espeak_SetParameter(espeakPITCH,pitch,0);
// espeakRANGE:   pitch range, range 0-100. 0-monotone, 50=normal
// espeakPUNCTUATION:  which punctuation characters to announce:
// value in espeak_PUNCT_TYPE (none, all, some), 
//espeak_VOICE *voice_spec = espeak_GetCurrentVoice();
//voice_spec->gender=2; // 0=none 1=male, 2=female,
//voice_spec->age = age;

//espeak_SetVoiceByProperties(voice_spec);

//espeak_SetSynthCallback(AndroidEspeakDirectSpeechCallback);
espeak_SetSynthCallback(AndroidEspeakSynthToFileCallback);

unsigned int unique_identifier;
espeak_ERROR err = espeak_Synth( (char*) word, strlen(word)+1, 0, POS_CHARACTER, 0, espeakCHARS_AUTO, &unique_identifier, NULL);

err = espeak_Synchronize();



/*
strcpy(voicename, GetLanguageVoiceName("EN"));
espeak_SetVoiceByName(voicename);
strcpy(word, "Geany was fragged by GSG9 Googlebot");
strcpy(word, "Googlebot");

espeak_Synth( (char*) word, strlen(word)+1, 0, POS_CHARACTER, 0, espeakCHARS_AUTO, NULL, NULL);
espeak_Synchronize();
*/

// espeak_Cancel();
espeak_Terminate();
printf("Espeak terminated\n");
system("pause");
return EXIT_SUCCESS; 

}

A: 

Have U tried passing the buffer U obtain in ur callback to sndplaysnd()??

Declare Function sndPlaySound Lib "winmm.dll" Alias "sndPlaySoundA" (ByVal lpszSoundName As String, ByVal uFlags As Long) As Long

Its standard winAPI is as follows:

  sndPlaySound(buffer[0], SND_ASYNC | SND_MEMORY)

Alternately, if U hav a wav-file that has the audio to play:

  sndPlaySound(filename, SND_ASYNC)

playsound has a ASYNC mode that wouldn't block ur program's execution while the audio is being played.

NOTE: I have used it in VB and the above snippets are for use in VB. If U are coding in VC++, U might have to modify them accordingly. But the basic intention remains the same; to pass the buffer to sndPlaySound with the ASYNC flag set.

Good LUCK!!

CVS-2600Hertz
sndPlaySound(wav, SND_ASYNC | SND_MEMORY) ;Thanks, it requires to link to winmm.lib in C++, but I tried and it doesn't work (no crash, but no sound either, I also tried SND_)...sndPlaySound(wav[0], SND_ASYNC | SND_MEMORY) ; crashes
Quandary
Since U are using this, FILE* user_data = fopen ( "myfile1.wav" , "ab" ); does the output lie in myfile1.wav. Tried playing that using sndPlaySound(filename, SND_ASYNC)??. Try playing the myfile1.wav in ur media-payer and check if it has any speech data in it. I'm curious...
CVS-2600Hertz
It's not a wav file, it's the data only... it doesn't even play in mediaplayer
Quandary
open myfile1.wav in notepad or some hexeditor and check for the header. Contains the letters "RIFF" in ASCII form (0x52494646 big-endian form). The wav PCM format:https://ccrma.stanford.edu/courses/422/projects/WaveFormat/ I am assuming the myfile1.wav should contain the output synthesized audio. Am i right?
CVS-2600Hertz
It's the pure data, see here:http://eyes-free.googlecode.com/svn-history/r245/trunk/native/frameworks/tts/jni/com_google_tts_SpeechSynthesis.cppIt's necessary to write the header oneselfs.
Quandary
oh. in that case, plz proceed to prefix the header and all. AFAIK, Even the memory-buffer being passed to sndplaysound is expected to hav the header and all. SndPlaySound() expects a "cooked" buffer NOT raw PCM samples. ( i suppose, the buffer[] facility in SndPlaySound() was specifically designed to be able to directly load and play wav files packed inside res files in exe-s) So preparing the wav properly is whats required i think...
CVS-2600Hertz