diff --git a/README.md b/README.md index e346ab6..593d923 100644 --- a/README.md +++ b/README.md @@ -216,7 +216,7 @@ if __name__ == "__main__": ```python import os -from aiola import AiolaClient +from aiola import AiolaClient, VoiceId def create_file(): try: @@ -230,7 +230,7 @@ def create_file(): audio = client.tts.synthesize( text='Hello, how can I help you today?', - voice_id='en_us_male' + voice_id=VoiceId.EnglishUSMale # Type-safe enum ) with open('./audio.wav', 'wb') as f: @@ -248,7 +248,7 @@ create_file() ```python import os -from aiola import AiolaClient +from aiola import AiolaClient, VoiceId def stream_tts(): try: @@ -262,7 +262,7 @@ def stream_tts(): stream = client.tts.stream( text='Hello, how can I help you today?', - voice_id='en_us_male' + voice_id=VoiceId.EnglishUSMale # Type-safe enum ) audio_chunks = [] @@ -314,7 +314,7 @@ if __name__ == "__main__": ```python import asyncio import os -from aiola import AsyncAiolaClient +from aiola import AsyncAiolaClient, VoiceId async def create_audio_file(): try: @@ -328,7 +328,7 @@ async def create_audio_file(): audio = client.tts.synthesize( text='Hello, how can I help you today?', - voice_id='en_us_male' + voice_id=VoiceId.EnglishUSMale # Type-safe enum ) with open('./audio.wav', 'wb') as f: @@ -348,7 +348,7 @@ if __name__ == "__main__": ```python import asyncio import os -from aiola import AsyncAiolaClient +from aiola import AsyncAiolaClient, VoiceId async def stream_tts(): try: @@ -362,7 +362,7 @@ async def stream_tts(): stream = client.tts.stream( text='Hello, how can I help you today?', - voice_id='en_us_male' + voice_id=VoiceId.EnglishUSMale # Type-safe enum ) audio_chunks = [] @@ -385,3 +385,609 @@ if __name__ == "__main__": ## Examples The SDK includes several example scripts in the `examples/` directory. + +--- + +## API Reference + +### AiolaClient + +Main client for interacting with the Aiola API (synchronous). 
+ +#### Static Methods + +##### `AiolaClient.grant_token(api_key, auth_base_url=None, workflow_id=None)` + +Generates an access token from an API key. + +**Parameters:** +- `api_key` (str, required): Your Aiola API key +- `auth_base_url` (str, optional): Custom authentication base URL +- `workflow_id` (str, optional): Custom workflow ID + +**Returns:** `GrantTokenResponse` +- `access_token` (str): JWT access token for API authentication +- `session_id` (str): Unique session identifier + +##### `AiolaClient.close_session(access_token, auth_base_url=None)` + +Closes a session on the server and frees up concurrency slots. + +**Parameters:** +- `access_token` (str, required): The access token for the session +- `auth_base_url` (str, optional): Custom authentication base URL + +**Returns:** `SessionCloseResponse` + +#### Constructor + +##### `AiolaClient(api_key=None, access_token=None, base_url=None, auth_base_url=None, workflow_id=None, timeout=150)` + +Creates a new AiolaClient instance. Either `api_key` or `access_token` must be provided. + +**Parameters:** +- `api_key` (str, optional): Your Aiola API key for automatic token management +- `access_token` (str, optional): Pre-generated access token from `grant_token()` +- `base_url` (str, optional): Custom API base URL +- `auth_base_url` (str, optional): Custom authentication base URL +- `workflow_id` (str, optional): Custom workflow ID +- `timeout` (float, optional): HTTP request timeout in seconds (default: 150) + +#### Properties + +##### `client.stt` + +Gets the Speech-to-Text (STT) client. + +**Type:** `SttClient` + +##### `client.tts` + +Gets the Text-to-Speech (TTS) client. + +**Type:** `TtsClient` + +##### `client.auth` + +Gets the authentication client (internal use). + +**Type:** `AuthClient` + +##### `client.options` + +Gets the client configuration options. + +**Type:** `AiolaClientOptions` + +--- + +### AsyncAiolaClient + +Main client for interacting with the Aiola API (asynchronous). 
Same API as `AiolaClient` but with async/await support. + +--- + +### STT Client + +Speech-to-Text client for audio transcription. + +#### Methods + +##### `client.stt.transcribe_file(file, language=None, keywords=None, vad_config=None)` + +Transcribes an audio file to text. + +**Parameters:** +- `file` (File, required): Audio file (file path, file object, or bytes). Formats: WAV, MP3, M4A, OGG, FLAC +- `language` (str, optional): Language code (e.g., 'en', 'es', 'fr') +- `keywords` (dict, optional): Keywords map for boosting recognition +- `vad_config` (VadConfig, optional): Voice Activity Detection configuration + +**Returns:** `TranscriptionResponse` +- `transcript` (str): Complete transcription with formatting +- `raw_transcript` (str): Raw transcription without post-processing +- `segments` (list[Segment]): Time segments where speech was detected +- `metadata` (TranscriptionMetadata): File metadata and transcription info + +##### `client.stt.stream(workflow_id=None, execution_id=None, lang_code=None, time_zone=None, keywords=None, tasks_config=None, vad_config=None)` + +Creates a real-time streaming connection for audio transcription. + +**Parameters:** +- `lang_code` (str, optional): Language code (e.g., 'en', 'es', 'fr') +- `workflow_id` (str, optional): Custom workflow ID +- `execution_id` (str, optional): Execution ID for tracking (auto-generated if not provided) +- `time_zone` (str, optional): Timezone for timestamps (default: 'UTC') +- `keywords` (dict, optional): Keywords map for boosting recognition +- `tasks_config` (TasksConfig, optional): AI tasks configuration +- `vad_config` (VadConfig, optional): Voice Activity Detection configuration + +**Returns:** `StreamConnection` + +**Audio format requirements:** 16-bit PCM, 16kHz sample rate, mono channel + +--- + +### StreamConnection + +Real-time audio streaming connection for transcription. + +#### Methods + +##### `stream.connect()` + +Establishes the Socket.IO connection. 
+ +##### `stream.disconnect()` + +Closes the Socket.IO connection. + +##### `stream.send(data)` + +Sends audio data to the streaming service. + +**Parameters:** +- `data` (bytes, required): Audio data (16-bit PCM, 16kHz, mono) + +##### `stream.on(event, handler=None)` + +Registers an event listener. Can be used as a decorator. + +**Available events:** +- `LiveEvents.Transcript`: Real-time transcription results +- `LiveEvents.Structured`: Structured data extraction results +- `LiveEvents.Translation`: Translation results (if enabled) +- `LiveEvents.Connect`: Connection established +- `LiveEvents.Disconnect`: Connection closed +- `LiveEvents.Error`: Error occurred + +##### `stream.set_keywords(keywords)` + +Sets or updates keywords for recognition boosting. + +**Parameters:** +- `keywords` (dict, required): Map of spoken phrases to written forms + +#### Properties + +##### `stream.connected` + +Indicates whether the client is currently connected. + +**Type:** `bool` + +--- + +### TTS Client + +Text-to-Speech client for converting text to spoken audio. + +#### Methods + +##### `client.tts.synthesize(text, voice_id)` + +Synthesizes text to speech with complete generation. 
+ +**Parameters:** +- `text` (str, required): Text to convert to speech +- `voice_id` (VoiceId | str, required): Voice identifier + +**Supported voice IDs (VoiceId enum):** + +| Enum Value | String Value | Description | +|------------|--------------|-------------| +| `VoiceId.EnglishUSFemale` | `'en_us_female'` | English (US) Female | +| `VoiceId.EnglishUSMale` | `'en_us_male'` | English (US) Male | +| `VoiceId.SpanishFemale` | `'es_female'` | Spanish Female | +| `VoiceId.SpanishMale` | `'es_male'` | Spanish Male | +| `VoiceId.FrenchFemale` | `'fr_female'` | French Female | +| `VoiceId.FrenchMale` | `'fr_male'` | French Male | +| `VoiceId.GermanFemale` | `'de_female'` | German Female | +| `VoiceId.GermanMale` | `'de_male'` | German Male | +| `VoiceId.JapaneseFemale` | `'ja_female'` | Japanese Female | +| `VoiceId.JapaneseMale` | `'ja_male'` | Japanese Male | +| `VoiceId.PortugueseFemale` | `'pt_female'` | Portuguese Female | +| `VoiceId.PortugueseMale` | `'pt_male'` | Portuguese Male | + +**Returns:** `Iterator[bytes]` + +**Example:** +```python +from aiola import VoiceId + +# Using enum (recommended - type-safe with autocomplete) +audio = client.tts.synthesize( + text='Hello, world!', + voice_id=VoiceId.EnglishUSFemale +) + +# Or using string +audio = client.tts.synthesize( + text='Hello, world!', + voice_id='en_us_female' +) +``` + +##### `client.tts.stream(text, voice_id)` + +Synthesizes text to speech with streaming delivery (lower latency). + +**Parameters:** +- `text` (str, required): Text to convert to speech +- `voice_id` (VoiceId | str, required): Voice identifier (see supported voices above) + +**Returns:** `Iterator[bytes]` + +--- + +### MicrophoneStream + +Utility for streaming audio from the microphone (requires `aiola[mic]` extra). 
+ +#### Constructor + +##### `MicrophoneStream(channels=1, samplerate=16000, blocksize=4096, device=None)` + +**Parameters:** +- `channels` (int, optional): Number of audio channels (default: 1) +- `samplerate` (int, optional): Sample rate in Hz (default: 16000) +- `blocksize` (int, optional): Audio block size (default: 4096) +- `device` (int/str, optional): Audio device to use + +#### Methods + +##### `mic.start()` + +Starts recording from the microphone. + +##### `mic.stop()` + +Stops recording. + +##### `mic.read(timeout=None)` + +Reads audio data from the microphone. + +**Returns:** `bytes` + +##### `mic.stream_to(connection)` + +Streams microphone audio to a StreamConnection. + +##### `mic.stream_with_callback(callback)` + +Streams microphone audio with a callback function. + +##### `mic.list_devices()` (classmethod) + +Lists available audio devices. + +**Returns:** List of audio devices + +#### Properties + +##### `mic.is_recording` + +Indicates whether currently recording. + +**Type:** `bool` + +--- + +### Streaming Events Reference + +#### `LiveEvents.Transcript` + +Emitted when new transcription text is available. + +**Payload:** +```python +{ + 'transcript': str # The transcribed text +} +``` + +#### `LiveEvents.Structured` + +Emitted when structured data is extracted (requires FORM_FILLING task). + +**Payload:** +```python +{ + 'results': dict # Structured data extracted +} +``` + +#### `LiveEvents.Translation` + +Emitted when translation is available (requires TRANSLATION task). + +**Payload:** +```python +{ + 'translation': str # The translated text +} +``` + +#### Connection Events + +- `LiveEvents.Connect`: Fired when Socket.IO connection is established +- `LiveEvents.Disconnect`: Fired when Socket.IO connection is closed +- `LiveEvents.Error`: Fired when an error occurs (receives error object) + +--- + +### Error Handling + +All errors thrown by the SDK inherit from `AiolaError`. 
+ +#### Error Classes + +| Error Class | Description | When Raised | +|-------------|-------------|-------------| +| `AiolaError` | Base error class | Base for all SDK errors | +| `AiolaConnectionError` | Network connectivity issues | Network errors, timeouts, DNS failures | +| `AiolaAuthenticationError` | Authentication failures | Invalid API key, expired token (401) | +| `AiolaValidationError` | Invalid parameters | Missing required fields, wrong types | +| `AiolaStreamingError` | Streaming issues | WebSocket connection problems | +| `AiolaFileError` | File operation failures | Invalid file format, file not found | +| `AiolaRateLimitError` | Rate limit exceeded | Too many requests (429) | +| `AiolaServerError` | Server-side errors | Internal server errors (5xx) | + +#### Error Attributes + +All error classes have these attributes: +- `message` (str): Human-readable error description +- `reason` (str): Detailed explanation from server +- `status` (int): HTTP status code +- `code` (str): Machine-readable error code +- `details` (Any): Additional diagnostic information + +#### Common Error Codes + +| Code | Description | How to Handle | +|------|-------------|---------------| +| `TOKEN_EXPIRED` | Access token has expired | Generate a new token | +| `INVALID_TOKEN` | Access token is invalid | Verify API key and generate new token | +| `MAX_CONCURRENCY_REACHED` | Too many concurrent sessions | Close existing sessions or wait | +| `INVALID_AUDIO_FORMAT` | Unsupported audio format | Use WAV, MP3, M4A, OGG, or FLAC | +| `RATE_LIMIT_EXCEEDED` | Too many requests | Implement exponential backoff | +| `WORKFLOW_NOT_FOUND` | Workflow ID doesn't exist | Verify workflow ID | +| `UNAUTHORIZED` | Invalid API key | Check API key and permissions | +| `VALIDATION_ERROR` | Invalid parameters | Check parameter types and values | + +#### Error Handling Examples + +```python +from aiola import AiolaClient, AiolaAuthenticationError, AiolaRateLimitError +import time + +# Handle specific 
errors +try: + result = client.stt.transcribe_file('audio.wav') +except AiolaAuthenticationError as e: + if e.code == 'TOKEN_EXPIRED': + # Refresh token and retry + token = AiolaClient.grant_token(api_key='your-key') + client = AiolaClient(access_token=token.access_token) + result = client.stt.transcribe_file('audio.wav') +except AiolaRateLimitError as e: + # Wait and retry + time.sleep(60) + result = client.stt.transcribe_file('audio.wav') + +# General error handling +try: + result = client.stt.transcribe_file('audio.wav') +except AiolaError as e: + print(f"Error: {e.message}") + print(f"Code: {e.code}") + print(f"Status: {e.status}") + print(f"Reason: {e.reason}") +``` + +--- + +### Audio Format Requirements + +#### File Transcription + +**Supported formats:** WAV, MP3, M4A, OGG, FLAC + +**Recommended specifications:** +- Sample rate: 16kHz or higher +- Channels: Mono or stereo +- Bit depth: 16-bit + +#### Streaming Audio + +**Required specifications:** +- Format: 16-bit PCM (raw audio) +- Sample rate: 16kHz +- Channels: Mono (1 channel) +- Encoding: Little-endian + +**Chunk size recommendations:** +- Minimum: 100ms of audio (~3,200 bytes) +- Recommended: 100-500ms chunks +- Maximum: 1000ms per chunk + +--- + +### Voice Activity Detection (VAD) + +Configure how speech and silence are detected in audio streams. + +**Configuration options:** + +```python +from aiola.types import VadConfig + +vad_config = VadConfig( + threshold=0.6, # Detection threshold (0.0-1.0, default: ~0.5) + min_speech_ms=300, # Minimum speech duration (default: ~250ms) + min_silence_ms=700, # Minimum silence to split segments (default: ~500ms) + max_segment_ms=15000 # Maximum segment duration (default: ~30000ms) +) +``` + +--- + +### AI Tasks Configuration + +Enable AI-powered analysis tasks during transcription. 
+ +**Example:** + +```python +from aiola.types import TasksConfig, TranslationPayload + +tasks_config = TasksConfig( + TRANSLATION=TranslationPayload( + src_lang_code='en', + dst_lang_code='es' + ) +) + +stream = client.stt.stream( + lang_code='en', + tasks_config=tasks_config +) +``` + +--- + +### Sync vs Async Usage + +#### When to Use Sync (`AiolaClient`) + +- Simple scripts and applications +- Blocking I/O is acceptable +- Working with synchronous libraries +- Traditional Python development + +#### When to Use Async (`AsyncAiolaClient`) + +- High-concurrency applications +- Web servers (FastAPI, aiohttp) +- Multiple concurrent API calls +- Need for non-blocking I/O +- Integration with async libraries + +**Example: Concurrent Transcriptions** + +```python +import asyncio +from aiola import AsyncAiolaClient + +async def transcribe_multiple_files(): + token = await AsyncAiolaClient.grant_token(api_key='your-key') + client = AsyncAiolaClient(access_token=token.access_token) + + files = ['audio1.wav', 'audio2.wav', 'audio3.wav'] + + # Transcribe all files concurrently + tasks = [ + client.stt.transcribe_file(file, language='en') + for file in files + ] + + results = await asyncio.gather(*tasks) + return results + +# Run async function +results = asyncio.run(transcribe_multiple_files()) +``` + +--- + +## Troubleshooting + +### Connection Issues + +**Problem:** Connection errors or timeouts + +**Solutions:** +- Verify network connection +- Check firewall settings for WebSocket connections +- Ensure you're using a valid access token +- Try increasing timeout parameter + +### Audio Quality Issues + +**Problem:** Poor transcription accuracy + +**Solutions:** +- Ensure audio is in correct format (16-bit PCM, 16kHz, mono for streaming) +- Remove background noise +- Use `keywords` parameter for domain-specific terms +- Adjust VAD configuration +- Verify correct language code + +### Token Expiration + +**Problem:** `TOKEN_EXPIRED` errors + +**Solutions:** +- Generate a new 
token when needed +- Implement automatic token refresh +- Tokens have a 5-minute expiration buffer + +### Concurrency Limits + +**Problem:** `MAX_CONCURRENCY_REACHED` errors + +**Solutions:** +- Close unused sessions using `close_session()` +- Wait for existing sessions to expire +- Upgrade plan for higher limits +- Implement session pooling + +### Microphone Issues + +**Problem:** Microphone not working + +**Solutions:** +- Install with mic extra: `pip install 'aiola[mic]'` +- Check microphone permissions +- List devices with `MicrophoneStream.list_devices()` +- Verify device index is correct +- Check microphone is not in use by another application + +### Import Errors + +**Problem:** `ImportError` when importing SDK + +**Solutions:** +- Ensure Python 3.10+ is installed +- Reinstall package: `pip install --upgrade aiola` +- For mic support: `pip install 'aiola[mic]'` +- Check virtual environment is activated + +### File Upload Errors + +**Problem:** Errors when uploading files + +**Solutions:** +- Verify file format is supported +- Check file exists and is readable +- Ensure file permissions are correct +- Verify file is not corrupted + +### Getting Help + +If you encounter issues not covered here: + +1. Check the [documentation](https://docs.aiola.ai) +2. Review the [examples](./examples) directory +3. Contact Aiola support with: + - Error messages and codes + - SDK version + - Python version + - Minimal code to reproduce the issue + +--- + +## License + +MIT License - see [LICENSE](LICENSE) file for details. 
diff --git a/aiola/__init__.py b/aiola/__init__.py index 73f96d7..e7673cc 100644 --- a/aiola/__init__.py +++ b/aiola/__init__.py @@ -13,12 +13,14 @@ AiolaValidationError, ) from .mic import MicrophoneStream +from .types import VoiceId __all__ = [ "AiolaClient", "AsyncAiolaClient", "TasksConfig", "MicrophoneStream", + "VoiceId", "AiolaError", "AiolaAuthenticationError", "AiolaConnectionError", diff --git a/aiola/client.py b/aiola/client.py index b9a2609..b3aeacd 100644 --- a/aiola/client.py +++ b/aiola/client.py @@ -9,7 +9,31 @@ class AiolaClient: - """Aiola SDK.""" + """Main client for interacting with the Aiola API (synchronous). + + Provides access to Speech-to-Text (STT), Text-to-Speech (TTS), and authentication services. + The client uses lazy initialization for service clients, creating them only when first accessed. + + You can initialize the client with either an API key (for automatic token management) or + a pre-generated access token. + + Examples: + Using API key (recommended for backend services): + >>> client = AiolaClient(api_key='your-api-key') + >>> transcription = client.stt.transcribe_file('audio.wav') + + Using access token: + >>> token_response = AiolaClient.grant_token(api_key='your-api-key') + >>> client = AiolaClient(access_token=token_response.access_token) + >>> transcription = client.stt.transcribe_file('audio.wav') + + Custom configuration: + >>> client = AiolaClient( + ... api_key='your-api-key', + ... base_url='https://custom-api.aiola.com', + ... timeout=60 + ... ) + """ def __init__( self, @@ -18,9 +42,46 @@ def __init__( access_token: str | None = None, base_url: str | None = None, auth_base_url: str | None = None, - workflow_id: str = DEFAULT_WORKFLOW_ID, - timeout: int = DEFAULT_HTTP_TIMEOUT, + workflow_id: str | None = DEFAULT_WORKFLOW_ID, + timeout: int | None = DEFAULT_HTTP_TIMEOUT, ): + """Initialize the Aiola client. + + Either api_key or access_token must be provided. If both are provided, + access_token takes precedence. 
+ + Args: + api_key: Your Aiola API key. If provided, the client will automatically + manage access tokens. + access_token: A pre-generated access token from grant_token(). Use this + for more control over token lifecycle. + base_url: Optional custom API base URL. Defaults to production endpoint. + auth_base_url: Optional custom authentication base URL. Defaults to + production authentication endpoint. + workflow_id: Optional custom workflow ID. Workflows define the AI + processing pipeline. Defaults to the standard workflow. + timeout: HTTP request timeout in seconds. Defaults to 150 seconds. + + Raises: + AiolaValidationError: If neither api_key nor access_token is provided, + or if parameters are invalid. + AiolaError: If client initialization fails. + + Examples: + >>> # Using API key + >>> client = AiolaClient(api_key='your-api-key') + + >>> # Using access token + >>> token = AiolaClient.grant_token('your-api-key') + >>> client = AiolaClient(access_token=token.access_token) + + >>> # With custom configuration + >>> client = AiolaClient( + ... api_key='your-api-key', + ... base_url='https://custom.aiola.com', + ... timeout=60 + ... ) + """ # Initialize lazy-loaded clients self._stt: SttClient | None = None self._tts: TtsClient | None = None @@ -42,10 +103,36 @@ def __init__( @property def options(self) -> AiolaClientOptions: + """Get the client configuration options. + + Returns the resolved configuration including default values for any + options that were not explicitly provided during construction. + + Returns: + AiolaClientOptions: The client configuration. + """ return self._options @property def stt(self) -> SttClient: + """Get the Speech-to-Text (STT) client. + + Provides access to transcription services including file transcription + and real-time streaming. The client is lazily initialized on first access. + + Returns: + SttClient: The STT client instance. + + Raises: + AiolaError: If STT client initialization fails. 
+ + Examples: + >>> # Transcribe a file + >>> result = client.stt.transcribe_file('audio.wav', language='en') + + >>> # Start a streaming session + >>> stream = client.stt.stream(lang_code='en') + """ if self._stt is None: try: self._stt = SttClient(self._options, self.auth) @@ -55,6 +142,24 @@ def stt(self) -> SttClient: @property def tts(self) -> TtsClient: + """Get the Text-to-Speech (TTS) client. + + Provides access to speech synthesis services. The client is lazily + initialized on first access. + + Returns: + TtsClient: The TTS client instance. + + Raises: + AiolaError: If TTS client initialization fails. + + Examples: + >>> # Synthesize text to speech + >>> audio_data = client.tts.synthesize( + ... text='Hello world', + ... voice_id='en-US-JennyNeural' + ... ) + """ if self._tts is None: try: self._tts = TtsClient(self._options, self.auth) @@ -64,6 +169,17 @@ def tts(self) -> TtsClient: @property def auth(self) -> AuthClient: + """Get the authentication client. + + Used internally for token management and validation. The client is lazily + initialized on first access. Most users will not need to access this directly. + + Returns: + AuthClient: The authentication client instance. + + Raises: + AiolaError: If Auth client initialization fails. + """ if self._auth is None: try: self._auth = AuthClient(options=self._options) @@ -118,7 +234,34 @@ def close_session(access_token: str, auth_base_url: str = DEFAULT_AUTH_BASE_URL) class AsyncAiolaClient: - """Asynchronous Aiola SDK.""" + """Main client for interacting with the Aiola API (asynchronous). + + Provides async/await access to Speech-to-Text (STT), Text-to-Speech (TTS), and + authentication services. Use this client in async applications for better + performance and concurrency. + + The client uses lazy initialization for service clients, creating them only when + first accessed. + + You can initialize the client with either an API key (for automatic token management) + or a pre-generated access token. 
+ + Examples: + Using API key (recommended for backend services): + >>> client = AsyncAiolaClient(api_key='your-api-key') + >>> transcription = await client.stt.transcribe_file('audio.wav') + + Using access token: + >>> token_response = await AsyncAiolaClient.grant_token(api_key='your-api-key') + >>> client = AsyncAiolaClient(access_token=token_response.access_token) + >>> transcription = await client.stt.transcribe_file('audio.wav') + + Custom configuration: + >>> client = AsyncAiolaClient( + ... api_key='your-api-key', + ... base_url='https://custom-api.aiola.com' + ... ) + """ def __init__( self, @@ -129,6 +272,41 @@ def __init__( auth_base_url: str | None = None, workflow_id: str = DEFAULT_WORKFLOW_ID, ): + """Initialize the async Aiola client. + + Either api_key or access_token must be provided. If both are provided, + access_token takes precedence. + + Args: + api_key: Your Aiola API key. If provided, the client will automatically + manage access tokens. + access_token: A pre-generated access token from grant_token(). Use this + for more control over token lifecycle. + base_url: Optional custom API base URL. Defaults to production endpoint. + auth_base_url: Optional custom authentication base URL. Defaults to + production authentication endpoint. + workflow_id: Optional custom workflow ID. Workflows define the AI + processing pipeline. Defaults to the standard workflow. + + Raises: + AiolaValidationError: If neither api_key nor access_token is provided, + or if parameters are invalid. + AiolaError: If client initialization fails. + + Examples: + >>> # Using API key + >>> client = AsyncAiolaClient(api_key='your-api-key') + + >>> # Using access token + >>> token = await AsyncAiolaClient.grant_token('your-api-key') + >>> client = AsyncAiolaClient(access_token=token.access_token) + + >>> # With custom configuration + >>> client = AsyncAiolaClient( + ... api_key='your-api-key', + ... base_url='https://custom.aiola.com' + ... 
) + """ # Initialize lazy-loaded clients self._stt: AsyncSttClient | None = None self._tts: AsyncTtsClient | None = None @@ -149,10 +327,36 @@ def __init__( @property def options(self) -> AiolaClientOptions: + """Get the client configuration options. + + Returns the resolved configuration including default values for any + options that were not explicitly provided during construction. + + Returns: + AiolaClientOptions: The client configuration. + """ return self._options @property def stt(self) -> AsyncSttClient: + """Get the Speech-to-Text (STT) client. + + Provides async access to transcription services including file transcription + and real-time streaming. The client is lazily initialized on first access. + + Returns: + AsyncSttClient: The async STT client instance. + + Raises: + AiolaError: If STT client initialization fails. + + Examples: + >>> # Transcribe a file + >>> result = await client.stt.transcribe_file('audio.wav', language='en') + + >>> # Start a streaming session + >>> stream = await client.stt.stream(lang_code='en') + """ if self._stt is None: try: self._stt = AsyncSttClient(self._options, self.auth) @@ -162,6 +366,26 @@ def stt(self) -> AsyncSttClient: @property def tts(self) -> AsyncTtsClient: + """Get the Text-to-Speech (TTS) client. + + Provides async access to speech synthesis services. The client is lazily + initialized on first access. + + Returns: + AsyncTtsClient: The async TTS client instance. + + Raises: + AiolaError: If TTS client initialization fails. + + Examples: + >>> # Synthesize text to speech + >>> async for chunk in client.tts.synthesize( + ... text='Hello world', + ... voice_id='en-US-JennyNeural' + ... ): + ... # Process audio chunk + ... pass + """ if self._tts is None: try: self._tts = AsyncTtsClient(self._options, self.auth) @@ -171,6 +395,17 @@ def tts(self) -> AsyncTtsClient: @property def auth(self) -> AsyncAuthClient: + """Get the authentication client. + + Used internally for token management and validation. 
The client is lazily + initialized on first access. Most users will not need to access this directly. + + Returns: + AsyncAuthClient: The async authentication client instance. + + Raises: + AiolaError: If Auth client initialization fails. + """ if self._auth is None: try: self._auth = AsyncAuthClient(options=self._options) diff --git a/aiola/clients/stt/client.py b/aiola/clients/stt/client.py index 8c1de0d..d36804d 100644 --- a/aiola/clients/stt/client.py +++ b/aiola/clients/stt/client.py @@ -111,7 +111,11 @@ def _validate_stream_params( class SttClient(_BaseStt): - """STT client.""" + """Speech-to-Text (STT) client for audio transcription services. + + Provides both file-based transcription and real-time streaming capabilities. + Supports multiple audio formats and various AI-powered tasks. + """ def __init__(self, options: AiolaClientOptions, auth: AuthClient) -> None: super().__init__(options, auth) @@ -127,19 +131,50 @@ def stream( tasks_config: TasksConfig | None = None, vad_config: VadConfig | None = None, ) -> StreamConnection: - """Create a streaming connection for real-time transcription. + """Create a real-time streaming connection for audio transcription. + + Returns a connection object that can be used to send audio data and receive + transcription events in real-time. Audio should be sent as 16-bit PCM at + 16kHz sample rate, mono channel. Args: - workflow_id: Workflow ID to use for this stream. If not provided, uses the client's - workflow_id from initialization, or falls back to the default workflow. - execution_id: Unique execution ID. If not provided, a UUID will be generated. - lang_code: Language code for transcription (default: "en"). - time_zone: Time zone for timestamps (default: "UTC"). - keywords: Optional keywords dictionary for enhanced transcription. - tasks_config: Optional configuration for additional AI tasks. + workflow_id: Optional workflow ID. 
If not provided, uses the client's + workflow_id from initialization, or falls back to the default workflow. + Workflows define the AI processing pipeline. + execution_id: Optional execution ID for tracking this session. If not + provided, a UUID will be automatically generated. Useful for + correlating logs and events. + lang_code: Optional language code for transcription (e.g., 'en', 'es', 'fr'). + If not specified, the service will attempt to auto-detect. + time_zone: Optional timezone for timestamps in events. Defaults to 'UTC'. + Use IANA timezone format (e.g., 'America/New_York', 'Europe/London'). + keywords: Optional dictionary mapping spoken phrases to written forms + for boosting recognition accuracy. Format: {'spoken': 'written'}. + tasks_config: Optional AI tasks configuration. Specify which AI-powered + analysis tasks should run (e.g., translation, sentiment analysis). + vad_config: Optional Voice Activity Detection configuration. Controls + how speech segments are detected and when events are emitted. Returns: StreamConnection: A connection object for real-time streaming. + + Raises: + AiolaValidationError: If parameters are invalid. + AiolaError: If connection creation fails. + + Examples: + >>> # Basic streaming + >>> stream = client.stt.stream(lang_code='en') + >>> stream.on('transcript', lambda data: print(data['transcript'])) + >>> stream.connect() + >>> stream.send(audio_data) + + >>> # With keywords and VAD config + >>> stream = client.stt.stream( + ... lang_code='en', + ... keywords={'aiola': 'Aiola', 'AI': 'AI'}, + ... vad_config={'min_speech_ms': 300, 'min_silence_ms': 700} + ... ) """ try: self._validate_stream_params( @@ -179,7 +214,55 @@ def transcribe_file( keywords: dict[str, str] | None = None, vad_config: VadConfig | None = None, ) -> TranscriptionResponse: - """Transcribe an audio file and return the transcription result.""" + """Transcribe an audio file to text. 
+ + Uploads and processes an audio file on the server, returning the complete + transcription once processing is finished. Supports multiple audio formats + including WAV, MP3, M4A, OGG, and FLAC. + + Args: + file: Audio file to transcribe. Can be a file path (str), file object, + or bytes. Supported formats: WAV, MP3, M4A, OGG, FLAC. + language: Optional language code (e.g., 'en', 'es', 'fr'). If not + specified, the service will attempt to auto-detect the language. + keywords: Optional dictionary mapping spoken phrases to written forms + for boosting recognition accuracy. Format: {'spoken': 'written'}. + vad_config: Optional Voice Activity Detection configuration. Controls + how speech segments are detected in the audio. + + Returns: + TranscriptionResponse: Complete transcription result including: + - transcript: The processed transcription text + - raw_transcript: Unprocessed transcription + - segments: Time segments where speech was detected + - metadata: File metadata and transcription information + + Raises: + AiolaFileError: If file parameter is missing or invalid. + AiolaValidationError: If parameters are invalid. + AiolaAuthenticationError: If authentication fails (401). + AiolaServerError: If server error occurs (500+). + AiolaConnectionError: If network error occurs. + AiolaError: For other transcription errors. + + Examples: + >>> # Transcribe from file path + >>> result = client.stt.transcribe_file('audio.wav', language='en') + >>> print(result.transcript) + + >>> # Transcribe with keywords + >>> result = client.stt.transcribe_file( + ... 'audio.wav', + ... language='en', + ... keywords={'aiola': 'Aiola', 'API': 'API'} + ... ) + + >>> # Transcribe with VAD config + >>> result = client.stt.transcribe_file( + ... 'audio.wav', + ... vad_config={'min_speech_ms': 300, 'threshold': 0.6} + ... 
) + """ if file is None: raise AiolaFileError("File parameter is required") @@ -229,7 +312,11 @@ def transcribe_file( class AsyncSttClient(_BaseStt): - """Asynchronous STT client.""" + """Asynchronous Speech-to-Text (STT) client for audio transcription. + + Provides async/await access to file-based transcription and real-time streaming. + Use this client in async applications for better performance and concurrency. + """ def __init__(self, options: AiolaClientOptions, auth: AsyncAuthClient) -> None: super().__init__(options, auth) @@ -245,19 +332,41 @@ async def stream( tasks_config: TasksConfig | None = None, vad_config: VadConfig | None = None, ) -> AsyncStreamConnection: - """Create an async streaming connection for real-time transcription. + """Create a real-time async streaming connection for audio transcription. + + Returns a connection object that can be used to send audio data and receive + transcription events in real-time using async/await. Audio should be sent as + 16-bit PCM at 16kHz sample rate, mono channel. Args: - workflow_id: Workflow ID to use for this stream. If not provided, uses the client's - workflow_id from initialization, or falls back to the default workflow. - execution_id: Unique execution ID. If not provided, a UUID will be generated. - lang_code: Language code for transcription (default: "en"). - time_zone: Time zone for timestamps (default: "UTC"). - keywords: Optional keywords dictionary for enhanced transcription. - tasks_config: Optional configuration for additional AI tasks. + workflow_id: Optional workflow ID. If not provided, uses the client's + workflow_id from initialization, or falls back to the default workflow. + execution_id: Optional execution ID for tracking. Auto-generated if not provided. + lang_code: Optional language code (e.g., 'en', 'es', 'fr'). + time_zone: Optional timezone for timestamps (default: 'UTC'). + keywords: Optional keywords dictionary for boosting recognition. 
+ tasks_config: Optional AI tasks configuration. + vad_config: Optional Voice Activity Detection configuration. Returns: - AsyncStreamConnection: A connection object for real-time async streaming. + AsyncStreamConnection: An async connection object for real-time streaming. + + Raises: + AiolaValidationError: If parameters are invalid. + AiolaError: If connection creation fails. + + Examples: + >>> # Basic async streaming + >>> stream = await client.stt.stream(lang_code='en') + >>> stream.on('transcript', lambda data: print(data['transcript'])) + >>> await stream.connect() + >>> await stream.send(audio_data) + + >>> # With AI tasks + >>> stream = await client.stt.stream( + ... lang_code='en', + ... tasks_config={'TRANSLATION': {'src_lang_code': 'en', 'dst_lang_code': 'es'}} + ... ) """ try: self._validate_stream_params( @@ -297,7 +406,42 @@ async def transcribe_file( keywords: dict[str, str] | None = None, vad_config: VadConfig | None = None, ) -> TranscriptionResponse: - """Transcribe an audio file and return the transcription result.""" + """Asynchronously transcribe an audio file to text. + + Uploads and processes an audio file on the server, returning the complete + transcription once processing is finished. Supports multiple audio formats. + + Args: + file: Audio file to transcribe. Can be a file path, file object, or bytes. + Supported formats: WAV, MP3, M4A, OGG, FLAC. + language: Optional language code (e.g., 'en', 'es', 'fr'). + keywords: Optional keywords dictionary for boosting recognition. + vad_config: Optional Voice Activity Detection configuration. + + Returns: + TranscriptionResponse: Complete transcription result with text, segments, + and metadata. + + Raises: + AiolaFileError: If file parameter is missing or invalid. + AiolaValidationError: If parameters are invalid. + AiolaAuthenticationError: If authentication fails. + AiolaServerError: If server error occurs. + AiolaConnectionError: If network error occurs. 
+ AiolaError: For other transcription errors. + + Examples: + >>> # Async transcription + >>> result = await client.stt.transcribe_file('audio.wav', language='en') + >>> print(result.transcript) + + >>> # With keywords + >>> result = await client.stt.transcribe_file( + ... 'audio.wav', + ... language='en', + ... keywords={'company': 'CompanyName'} + ... ) + """ if file is None: raise AiolaFileError("File parameter is required") diff --git a/aiola/clients/stt/stream_client.py b/aiola/clients/stt/stream_client.py index 66f48fd..0e01330 100644 --- a/aiola/clients/stt/stream_client.py +++ b/aiola/clients/stt/stream_client.py @@ -9,7 +9,33 @@ class StreamConnection: - """Stream connection for the STT client.""" + """Real-time audio streaming connection for transcription. + + Manages a Socket.IO connection for bidirectional communication with the Aiola + streaming service. Handles automatic reconnection and provides event-based + communication for real-time transcription. + + Audio must be sent as 16-bit PCM format at 16kHz sample rate, mono channel. + + Examples: + >>> # Create and use a streaming connection + >>> stream = client.stt.stream(lang_code='en') + >>> + >>> # Register event handlers + >>> @stream.on(LiveEvents.Transcript) + >>> def on_transcript(data): + >>> print(f"Transcript: {data['transcript']}") + >>> + >>> # Or using direct registration + >>> stream.on(LiveEvents.Connect, lambda: print("Connected")) + >>> + >>> # Connect and start streaming + >>> stream.connect() + >>> stream.send(audio_data) + >>> + >>> # Clean up + >>> stream.disconnect() + """ def __init__( self, @@ -32,7 +58,21 @@ def __init__( ) def connect(self) -> None: - """Establish the socket connection using stored parameters.""" + """Establish the Socket.IO connection to the streaming service. + + Creates a WebSocket connection for real-time audio streaming. The connection + automatically uses the URL, headers, and parameters configured during + initialization. 
Supports automatic reconnection up to 3 attempts. + + If already connected, this method returns without creating a new connection. + + Raises: + AiolaStreamingError: If connection fails. + + Examples: + >>> stream.on(LiveEvents.Connect, lambda: print("Connected!")) + >>> stream.connect() + """ if self._sio.connected: return # Already connected @@ -75,7 +115,28 @@ def decorator(func: Callable[..., Any]) -> Callable[..., Any]: raise AiolaStreamingError(f"Failed to register event handler for '{event}'") from exc def send(self, data: bytes) -> None: - """Send binary audio data.""" + """Send audio data to the streaming service. + + Audio must be in 16-bit PCM format at 16kHz sample rate, mono channel. + Send audio in chunks as it becomes available (typically 100-1000ms chunks). + + Args: + data: Audio data in bytes (16-bit PCM, 16kHz, mono). + + Raises: + AiolaError: If connection is not established. + AiolaValidationError: If data is not bytes. + AiolaStreamingError: If sending data fails. + + Examples: + >>> # Send audio chunk + >>> stream.send(audio_bytes) + + >>> # Stream from microphone + >>> while recording: + >>> chunk = microphone.read() + >>> stream.send(chunk) + """ if not self.connected: raise AiolaError("Connection not established") @@ -88,7 +149,26 @@ def send(self, data: bytes) -> None: raise AiolaStreamingError("Failed to send audio data") from exc def set_keywords(self, keywords: dict[str, str]) -> None: - """Send keywords list to the server.""" + """Set or update keywords for recognition boosting. + + Keywords are used to improve recognition accuracy for specific terms or phrases. + The dictionary maps spoken phrases to their written forms. + + Args: + keywords: Dictionary mapping spoken phrases to written forms. + Example: {'eye ola': 'Aiola', 'open AI': 'OpenAI'} + + Raises: + AiolaValidationError: If keywords is not a dict or values are not strings. + AiolaStreamingError: If sending keywords fails. 
+ + Examples: + >>> stream.set_keywords({ + >>> 'aiola': 'Aiola', + >>> 'API': 'API', + >>> 'docker': 'Docker' + >>> }) + """ if not isinstance(keywords, dict): raise AiolaValidationError("Keywords must be a dict") @@ -101,7 +181,18 @@ def set_keywords(self, keywords: dict[str, str]) -> None: raise AiolaStreamingError("Failed to send keywords") from exc def disconnect(self) -> None: - """Disconnect the socket connection.""" + """Close the Socket.IO connection to the streaming service. + + Gracefully terminates the connection. Always disconnect when finished to + free up server resources. If not connected, this method returns without error. + + Raises: + AiolaStreamingError: If disconnection fails. + + Examples: + >>> stream.disconnect() + >>> print("Disconnected from streaming service") + """ if self._sio.connected: try: self._sio.disconnect() @@ -115,7 +206,30 @@ def connected(self) -> bool: class AsyncStreamConnection: - """Async stream connection for the STT client.""" + """Asynchronous real-time audio streaming connection for transcription. + + Manages an async Socket.IO connection for bidirectional communication with the + Aiola streaming service. Use this in async applications for better performance + and concurrency. + + Audio must be sent as 16-bit PCM format at 16kHz sample rate, mono channel. + + Examples: + >>> # Create and use an async streaming connection + >>> stream = await client.stt.stream(lang_code='en') + >>> + >>> # Register event handlers + >>> @stream.on(LiveEvents.Transcript) + >>> def on_transcript(data): + >>> print(f"Transcript: {data['transcript']}") + >>> + >>> # Connect and start streaming + >>> await stream.connect() + >>> await stream.send(audio_data) + >>> + >>> # Clean up + >>> await stream.disconnect() + """ def __init__( self, @@ -138,7 +252,18 @@ def __init__( ) async def connect(self) -> None: - """Establish the socket connection using stored parameters.""" + """Asynchronously establish the Socket.IO connection. 
+ + Creates a WebSocket connection for real-time audio streaming using async/await. + Supports automatic reconnection up to 3 attempts. + + Raises: + AiolaStreamingError: If connection fails. + + Examples: + >>> stream.on(LiveEvents.Connect, lambda: print("Connected!")) + >>> await stream.connect() + """ if self._sio.connected: return # Already connected @@ -155,7 +280,31 @@ async def connect(self) -> None: raise AiolaStreamingError("Failed to connect to Streaming service") from exc def on(self, event: LiveEvents, handler: Callable[..., Any] | None = None) -> Callable[..., Any]: - """Register an event handler.""" + """Register an event handler for async streaming events. + + Can be used as a decorator or called directly. Handlers can be regular + functions or async functions. + + Args: + event: The event to listen for (from LiveEvents enum). + handler: Optional event handler function. + + Returns: + The registered handler function (or decorator if handler is None). + + Raises: + AiolaValidationError: If event or handler is invalid. + AiolaStreamingError: If registration fails. + + Examples: + >>> # Decorator usage + >>> @stream.on(LiveEvents.Transcript) + >>> async def handle_transcript(data): + >>> await process_transcript(data['transcript']) + + >>> # Direct usage + >>> stream.on(LiveEvents.Connect, lambda: print("Connected")) + """ if not isinstance(event, LiveEvents) or not event: raise AiolaValidationError("Event name must be a non-empty string") @@ -181,7 +330,21 @@ def decorator(func: Callable[..., Any]) -> Callable[..., Any]: raise AiolaStreamingError(f"Failed to register event handler for '{event}'") from exc async def send(self, data: bytes) -> None: - """Send binary audio data.""" + """Asynchronously send audio data to the streaming service. + + Audio must be in 16-bit PCM format at 16kHz sample rate, mono channel. + + Args: + data: Audio data in bytes (16-bit PCM, 16kHz, mono). + + Raises: + AiolaError: If connection is not established. 
+ AiolaValidationError: If data is not bytes. + AiolaStreamingError: If sending data fails. + + Examples: + >>> await stream.send(audio_bytes) + """ if not self.connected: raise AiolaError("Connection not established") @@ -194,7 +357,18 @@ async def send(self, data: bytes) -> None: raise AiolaStreamingError("Failed to send audio data") from exc async def set_keywords(self, keywords: dict[str, str]) -> None: - """Send keywords list to the server.""" + """Asynchronously set or update keywords for recognition boosting. + + Args: + keywords: Dictionary mapping spoken phrases to written forms. + + Raises: + AiolaValidationError: If keywords format is invalid. + AiolaStreamingError: If sending keywords fails. + + Examples: + >>> await stream.set_keywords({'aiola': 'Aiola', 'API': 'API'}) + """ if not isinstance(keywords, dict): raise AiolaValidationError("Keywords must be a dict") @@ -207,7 +381,16 @@ async def set_keywords(self, keywords: dict[str, str]) -> None: raise AiolaStreamingError("Failed to send keywords") from exc async def disconnect(self) -> None: - """Disconnect the socket connection.""" + """Asynchronously close the Socket.IO connection. + + Gracefully terminates the connection using async/await. + + Raises: + AiolaStreamingError: If disconnection fails. 
+ + Examples: + >>> await stream.disconnect() + """ if self._sio.connected: try: await self._sio.disconnect() diff --git a/aiola/clients/tts/client.py b/aiola/clients/tts/client.py index 91bd54c..aff60ec 100644 --- a/aiola/clients/tts/client.py +++ b/aiola/clients/tts/client.py @@ -7,7 +7,7 @@ from ...errors import AiolaAuthenticationError, AiolaConnectionError, AiolaError, AiolaServerError, AiolaValidationError from ...http_client import create_async_authenticated_client, create_authenticated_client -from ...types import AiolaClientOptions +from ...types import AiolaClientOptions, VoiceId if TYPE_CHECKING: from ...clients.auth.client import AsyncAuthClient, AuthClient @@ -22,23 +22,70 @@ def __init__(self, options: AiolaClientOptions, auth: AuthClient | AsyncAuthClie def _make_headers() -> dict[str, str]: return {"Accept": "audio/*"} - def _validate_tts_params(self, text: str, voice_id: str) -> None: + def _validate_tts_params(self, text: str, voice_id: VoiceId | str) -> None: """Validate TTS parameters.""" if not text or not isinstance(text, str): raise AiolaValidationError("text must be a non-empty string") + # VoiceId enum inherits from str, so isinstance check works for both if not voice_id or not isinstance(voice_id, str): raise AiolaValidationError("voice_id must be a non-empty string") class TtsClient(BaseTts): - """TTS client.""" + """Text-to-Speech (TTS) client for converting text to spoken audio. + + Provides methods for both streaming and complete synthesis of text to speech. + Audio is returned as an iterator of bytes for efficient handling of large audio files. 
+ """ def __init__(self, options: AiolaClientOptions, auth: AuthClient): super().__init__(options, auth) self._auth: AuthClient = auth # Type narrowing - def stream(self, *, text: str, voice_id: str) -> Iterator[bytes]: - """Stream synthesized audio in real-time.""" + def stream(self, *, text: str, voice_id: VoiceId | str) -> Iterator[bytes]: + """Synthesize text to speech with streaming delivery. + + Returns audio as it's generated, providing lower latency than the synthesize + method. Ideal for real-time applications where you want to start playing audio + as soon as the first chunks are available. + + Args: + text: The text to convert to speech. Can include punctuation and + formatting which may affect prosody and pauses. + voice_id: Voice identifier for synthesis. Supported voices: + - 'en_us_female', 'en_us_male' - English (US) + - 'es_female', 'es_male' - Spanish + - 'fr_female', 'fr_male' - French + - 'de_female', 'de_male' - German + - 'ja_female', 'ja_male' - Japanese + - 'pt_female', 'pt_male' - Portuguese + + Yields: + bytes: Audio data chunks as they are generated. + + Raises: + AiolaValidationError: If text or voice_id is invalid. + AiolaAuthenticationError: If authentication fails (401). + AiolaServerError: If server error occurs (500+). + AiolaConnectionError: If network error occurs. + AiolaError: For other synthesis errors. + + Examples: + >>> # Stream synthesis for real-time playback + >>> for chunk in client.tts.stream( + ... text='Hello, welcome to Aiola!', + ... voice_id='en-US-JennyNeural' + ... ): + ... audio_player.play(chunk) + + >>> # Save streamed audio to file + >>> with open('output.wav', 'wb') as f: + ... for chunk in client.tts.stream( + ... text='This is a test', + ... voice_id='en-GB-RyanNeural' + ... ): + ... 
+            ...         f.write(chunk)
+        """
         self._validate_tts_params(text, voice_id)

         try:
@@ -77,8 +124,48 @@ def stream(self, *, text: str, voice_id: str) -> Iterator[bytes]:
         except Exception as exc:
             raise AiolaError(f"TTS streaming failed: {str(exc)}") from exc

-    def synthesize(self, *, text: str, voice_id: str) -> Iterator[bytes]:
-        """Synthesize audio and return as iterator of bytes."""
+    def synthesize(self, *, text: str, voice_id: VoiceId | str) -> Iterator[bytes]:
+        """Synthesize text to speech with complete generation.
+
+        Waits for the complete audio to be generated before starting to return data.
+        Use this when you need the complete audio file, or when you want to ensure
+        the entire synthesis is successful before proceeding.
+
+        Args:
+            text: The text to convert to speech.
+            voice_id: Voice identifier for synthesis. Supported voices:
+                - 'en_us_female', 'en_us_male' - English (US)
+                - 'es_female', 'es_male' - Spanish
+                - 'fr_female', 'fr_male' - French
+                - 'de_female', 'de_male' - German
+                - 'ja_female', 'ja_male' - Japanese
+                - 'pt_female', 'pt_male' - Portuguese
+
+        Yields:
+            bytes: Audio data chunks.
+
+        Raises:
+            AiolaValidationError: If text or voice_id is invalid.
+            AiolaAuthenticationError: If authentication fails.
+            AiolaServerError: If server error occurs.
+            AiolaConnectionError: If network error occurs.
+            AiolaError: For other synthesis errors.
+
+        Examples:
+            >>> # Synthesize complete audio
+            >>> chunks = []
+            >>> for chunk in client.tts.synthesize(
+            ...     text='This is a longer text that will be fully synthesized.',
+            ...     voice_id='en_us_female'
+            ... ):
+            ...     chunks.append(chunk)
+            >>> audio_data = b''.join(chunks)
+
+            >>> # Save to file
+            >>> with open('output.wav', 'wb') as f:
+            ...     for chunk in client.tts.synthesize(text='Hello', voice_id='en_us_male'):
+            ...         f.write(chunk)
+        """
         self._validate_tts_params(text, voice_id)

         try:
@@ -119,14 +206,58 @@


 class AsyncTtsClient(BaseTts):
-    """Asynchronous TTS client."""
+    """Asynchronous Text-to-Speech (TTS) client for converting text to audio.
+
+    Provides async/await access to speech synthesis services. Use this client in
+    async applications for better performance and concurrency.
+    """

     def __init__(self, options: AiolaClientOptions, auth: AsyncAuthClient):
         super().__init__(options, auth)
         self._auth: AsyncAuthClient = auth  # Type narrowing

-    async def stream(self, *, text: str, voice_id: str) -> AsyncIterator[bytes]:
-        """Stream synthesized audio in real-time (async)."""
+    async def stream(self, *, text: str, voice_id: VoiceId | str) -> AsyncIterator[bytes]:
+        """Asynchronously synthesize text to speech with streaming delivery.
+
+        Returns audio as it's generated using async iteration. Provides lower latency
+        for real-time applications.
+
+        Args:
+            text: The text to convert to speech.
+            voice_id: Voice identifier for synthesis. Supported voices:
+                - 'en_us_female', 'en_us_male' - English (US)
+                - 'es_female', 'es_male' - Spanish
+                - 'fr_female', 'fr_male' - French
+                - 'de_female', 'de_male' - German
+                - 'ja_female', 'ja_male' - Japanese
+                - 'pt_female', 'pt_male' - Portuguese
+
+        Yields:
+            bytes: Audio data chunks as they are generated.
+
+        Raises:
+            AiolaValidationError: If parameters are invalid.
+            AiolaAuthenticationError: If authentication fails.
+            AiolaServerError: If server error occurs.
+            AiolaConnectionError: If network error occurs.
+            AiolaError: For other synthesis errors.
+
+        Examples:
+            >>> # Async stream synthesis
+            >>> async for chunk in await client.tts.stream(
+            ...     text='Hello, world!',
+            ...     voice_id='en_us_female'
+            ... ):
+            ...     await audio_player.play(chunk)
+
+            >>> # Save to file asynchronously
+            >>> async with aiofiles.open('output.wav', 'wb') as f:
+            ...     async for chunk in await client.tts.stream(
+            ...         text='Test audio',
+            ...         voice_id='en_us_male'
+            ...     ):
+            ...         await f.write(chunk)
+        """
         self._validate_tts_params(text, voice_id)

         try:
@@ -167,8 +298,41 @@ async def stream(self, *, text: str, voice_id: str) -> AsyncIterator[bytes]:
         except Exception as exc:
             raise AiolaError(f"Async TTS streaming failed: {str(exc)}") from exc

-    async def synthesize(self, *, text: str, voice_id: str) -> AsyncIterator[bytes]:
-        """Synthesize audio and return as async iterator of bytes."""
+    async def synthesize(self, *, text: str, voice_id: VoiceId | str) -> AsyncIterator[bytes]:
+        """Asynchronously synthesize text to speech with complete generation.
+
+        Waits for complete audio generation before streaming using async iteration.
+
+        Args:
+            text: The text to convert to speech.
+            voice_id: Voice identifier for synthesis. Supported voices:
+                - 'en_us_female', 'en_us_male' - English (US)
+                - 'es_female', 'es_male' - Spanish
+                - 'fr_female', 'fr_male' - French
+                - 'de_female', 'de_male' - German
+                - 'ja_female', 'ja_male' - Japanese
+                - 'pt_female', 'pt_male' - Portuguese
+
+        Yields:
+            bytes: Audio data chunks.
+
+        Raises:
+            AiolaValidationError: If parameters are invalid.
+            AiolaAuthenticationError: If authentication fails.
+            AiolaServerError: If server error occurs.
+            AiolaConnectionError: If network error occurs.
+            AiolaError: For other synthesis errors.
+
+        Examples:
+            >>> # Async synthesis
+            >>> chunks = []
+            >>> async for chunk in await client.tts.synthesize(
+            ...     text='Complete synthesis example',
+            ...     voice_id='en_us_female'
+            ... ):
+            ...
chunks.append(chunk) + >>> audio_data = b''.join(chunks) + """ self._validate_tts_params(text, voice_id) try: diff --git a/aiola/errors.py b/aiola/errors.py index 193e499..4a40115 100644 --- a/aiola/errors.py +++ b/aiola/errors.py @@ -10,6 +10,34 @@ class AiolaError(Exception): All errors thrown by this SDK inherit from :class:`AiolaError` so that callers can rely on a single error type for predictable error handling. + + Attributes: + message: Human-readable description of the error. + reason: Detailed explanation from the server (if available). + status: HTTP status code when the error originates from an HTTP response. + code: Machine-readable error code for programmatic handling. + details: Additional diagnostic information that may help debug the problem. + + Common error codes: + - TOKEN_EXPIRED: Access token has expired and needs to be refreshed + - INVALID_TOKEN: Access token is malformed or invalid + - MAX_CONCURRENCY_REACHED: Too many concurrent sessions for your account + - INVALID_AUDIO_FORMAT: The audio format is not supported + - RATE_LIMIT_EXCEEDED: Too many requests in a short period + - WORKFLOW_NOT_FOUND: The specified workflow ID does not exist + - UNAUTHORIZED: Invalid API key or insufficient permissions + - VALIDATION_ERROR: Request parameters are invalid + + Examples: + >>> try: + ... result = client.stt.transcribe_file('audio.wav') + ... except AiolaError as e: + ... print(f"Error: {e.message}") + ... print(f"Code: {e.code}") + ... print(f"Status: {e.status}") + ... if e.code == 'TOKEN_EXPIRED': + ... # Refresh token and retry + ... pass """ def __init__( @@ -66,42 +94,242 @@ def __str__(self) -> str: class AiolaConnectionError(AiolaError): - """Raised when there are connectivity issues with the Aiola API.""" + """Raised when there are connectivity issues with the Aiola API. 
+ + This error indicates network-level problems such as: + - DNS resolution failures + - Connection timeouts + - Network unreachable errors + - SSL/TLS handshake failures + + When raised: + - During any HTTP request if network connection fails + - When the API endpoint is unreachable + - When network errors occur during file upload or streaming + + How to handle: + - Check network connectivity + - Verify firewall settings + - Retry with exponential backoff + - Check if the API endpoint URL is correct + """ pass class AiolaAuthenticationError(AiolaError): - """Raised when authentication fails (invalid API key, expired token, etc.).""" + """Raised when authentication fails. + + This error indicates authentication-related issues such as: + - Invalid API key + - Expired access token + - Malformed access token + - Missing authentication credentials + - Unauthorized access to resources + + When raised: + - When API key is invalid or missing + - When access token has expired (typically after some time) + - When access token is malformed or corrupted + - When trying to access resources without proper permissions + - HTTP 401 responses from the API + + How to handle: + - Verify your API key is correct + - Generate a new access token using grant_token() + - Check that you're using a valid, non-expired token + - Ensure your account has the necessary permissions + + Examples: + >>> try: + ... result = client.stt.transcribe_file('audio.wav') + ... except AiolaAuthenticationError as e: + ... if e.code == 'TOKEN_EXPIRED': + ... # Generate new token + ... token = AiolaClient.grant_token(api_key='your-key') + ... new_client = AiolaClient(access_token=token.access_token) + ... result = new_client.stt.transcribe_file('audio.wav') + """ pass class AiolaValidationError(AiolaError): - """Raised when input validation fails (invalid parameters, missing required fields, etc.).""" + """Raised when input validation fails. 
+ + This error indicates problems with the parameters provided to SDK methods: + - Missing required parameters + - Invalid parameter types + - Invalid parameter values + - Parameters out of acceptable range + + When raised: + - When required parameters are missing (e.g., file, text, api_key) + - When parameter types are incorrect (e.g., string expected but int provided) + - When parameter values are invalid (e.g., empty string, negative numbers) + - Before making API requests (client-side validation) + - When API returns 400 Bad Request for invalid input + + How to handle: + - Check the error message for specific validation failure + - Verify all required parameters are provided + - Ensure parameter types match expected types + - Validate parameter values are within acceptable ranges + + Examples: + >>> try: + ... stream = client.stt.stream(lang_code=123) # Should be string + ... except AiolaValidationError as e: + ... print(f"Validation error: {e.message}") + ... # Fix: stream = client.stt.stream(lang_code='en') + """ pass class AiolaStreamingError(AiolaError): - """Raised when streaming operations fail (WebSocket connection issues, etc.).""" + """Raised when streaming operations fail. 
+ + This error indicates issues specific to real-time streaming: + - WebSocket connection failures + - Socket.IO connection issues + - Connection drops during streaming + - Failures to send or receive streaming data + - Event handler registration failures + + When raised: + - When failing to establish WebSocket connection + - When connection drops unexpectedly during streaming + - When unable to send audio data to the streaming service + - When event handler registration fails + - When disconnection fails + + How to handle: + - Check network stability + - Verify WebSocket connections are not blocked by firewall + - Implement reconnection logic with exponential backoff + - Listen to disconnect events and handle reconnection + - Validate audio format meets requirements (16-bit PCM, 16kHz, mono) + + Examples: + >>> try: + ... stream = client.stt.stream(lang_code='en') + ... stream.connect() + ... stream.send(audio_data) + ... except AiolaStreamingError as e: + ... print(f"Streaming error: {e.message}") + ... # Attempt reconnection + ... stream.disconnect() + ... stream.connect() + """ pass class AiolaFileError(AiolaError): - """Raised when file operations fail (invalid file format, file too large, etc.).""" + """Raised when file operations fail. + + This error indicates issues with file handling: + - Invalid or unsupported file format + - File not found + - File too large + - File read/write errors + - Corrupted file + + When raised: + - When file parameter is None or missing + - When file format is not supported (must be WAV, MP3, M4A, OGG, or FLAC) + - When file path doesn't exist or is inaccessible + - When file exceeds size limits + - When file is corrupted or unreadable + + How to handle: + - Verify file exists and is accessible + - Check file format is supported + - Ensure file is not corrupted + - Check file size is within limits + - Verify file permissions for reading + + Examples: + >>> try: + ... result = client.stt.transcribe_file(None) + ... 
except AiolaFileError as e: + ... print(f"File error: {e.message}") + ... # Fix: Provide valid file + ... result = client.stt.transcribe_file('audio.wav') + """ pass class AiolaRateLimitError(AiolaError): - """Raised when API rate limits are exceeded.""" + """Raised when API rate limits are exceeded. + + This error indicates you've made too many requests in a short period: + - Too many requests per second/minute/hour + - Concurrent request limit exceeded + - Quota exhausted + + When raised: + - When HTTP 429 (Too Many Requests) is returned + - When making requests too rapidly + - When exceeding concurrent session limits + - When account quota is exhausted + + How to handle: + - Implement exponential backoff and retry logic + - Reduce request rate + - Close unused sessions + - Check rate limit headers if available + - Consider upgrading plan for higher limits + + Examples: + >>> import time + >>> try: + ... for i in range(100): + ... result = client.stt.transcribe_file(f'audio{i}.wav') + ... except AiolaRateLimitError as e: + ... print(f"Rate limit exceeded: {e.message}") + ... time.sleep(60) # Wait before retrying + """ pass class AiolaServerError(AiolaError): - """Raised when the Aiola API returns a server error (5xx status codes).""" + """Raised when the Aiola API returns a server error (5xx status codes). 
+ + This error indicates issues on the server side: + - Internal server errors (500) + - Service unavailable (503) + - Gateway timeout (504) + - Other server-side problems + + When raised: + - When HTTP 500 (Internal Server Error) is returned + - When HTTP 503 (Service Unavailable) is returned + - When HTTP 504 (Gateway Timeout) is returned + - When any other 5xx status code is returned + + How to handle: + - Retry the request after a delay + - Implement exponential backoff + - Check service status page if available + - Contact support if error persists + - These are typically temporary issues + + Examples: + >>> import time + >>> max_retries = 3 + >>> for attempt in range(max_retries): + ... try: + ... result = client.stt.transcribe_file('audio.wav') + ... break + ... except AiolaServerError as e: + ... if attempt < max_retries - 1: + ... time.sleep(2 ** attempt) # Exponential backoff + ... else: + ... raise + """ pass diff --git a/aiola/types.py b/aiola/types.py index 35377df..2f9a688 100644 --- a/aiola/types.py +++ b/aiola/types.py @@ -10,13 +10,29 @@ @dataclass class AiolaClientOptions: - """Configuration options for Aiola clients.""" + """Configuration options for Aiola clients. + + Contains all configuration parameters needed to initialize and configure + an Aiola client. Either api_key or access_token must be provided. + + Attributes: + base_url: API base URL. Defaults to production Aiola API endpoint. + auth_base_url: Authentication service base URL. Defaults to production + authentication endpoint. + api_key: Your Aiola API key. Used for automatic token management. Either + this or access_token must be provided. + access_token: Pre-generated access token from grant_token(). Either this + or api_key must be provided. + workflow_id: Workflow ID defining the AI processing pipeline. Defaults + to the standard workflow. + timeout: HTTP request timeout in seconds. Defaults to 150 seconds. 
+ """ base_url: str | None = DEFAULT_BASE_URL auth_base_url: str | None = DEFAULT_AUTH_BASE_URL api_key: str | None = None access_token: str | None = None - workflow_id: str = DEFAULT_WORKFLOW_ID + workflow_id: str | None = DEFAULT_WORKFLOW_ID timeout: float | None = DEFAULT_HTTP_TIMEOUT def __post_init__(self) -> None: @@ -44,6 +60,22 @@ def __post_init__(self) -> None: class LiveEvents(str, enum.Enum): + """Events that can be received during live audio streaming. + + These events provide real-time feedback and results from the streaming + transcription service. + + Attributes: + Transcript: Real-time transcription results emitted for each detected + speech segment. + Translation: Translation results (only if TRANSLATION task is enabled). + Structured: Structured data extraction results (only if FORM_FILLING + task is enabled). + Error: Error events indicating issues during streaming. + Disconnect: Connection closed event. + Connect: Connection established event. + """ + Transcript = "transcript" Translation = "translation" Structured = "structured" @@ -52,15 +84,79 @@ class LiveEvents(str, enum.Enum): Connect = "connect" +class VoiceId(str, enum.Enum): + """Supported voice identifiers for text-to-speech synthesis. + + Use these enum values for type safety and autocomplete when specifying voices. 
+ + Attributes: + EnglishUSFemale: English (US) - Female voice ('en_us_female') + EnglishUSMale: English (US) - Male voice ('en_us_male') + SpanishFemale: Spanish - Female voice ('es_female') + SpanishMale: Spanish - Male voice ('es_male') + FrenchFemale: French - Female voice ('fr_female') + FrenchMale: French - Male voice ('fr_male') + GermanFemale: German - Female voice ('de_female') + GermanMale: German - Male voice ('de_male') + JapaneseFemale: Japanese - Female voice ('ja_female') + JapaneseMale: Japanese - Male voice ('ja_male') + PortugueseFemale: Portuguese - Female voice ('pt_female') + PortugueseMale: Portuguese - Male voice ('pt_male') + + Examples: + >>> from aiola import AiolaClient, VoiceId + >>> client = AiolaClient(access_token='your-token') + >>> audio = client.tts.synthesize( + ... text='Hello, world!', + ... voice_id=VoiceId.EnglishUSFemale + ... ) + """ + + EnglishUSFemale = "en_us_female" + EnglishUSMale = "en_us_male" + SpanishFemale = "es_female" + SpanishMale = "es_male" + FrenchFemale = "fr_female" + FrenchMale = "fr_male" + GermanFemale = "de_female" + GermanMale = "de_male" + JapaneseFemale = "ja_female" + JapaneseMale = "ja_male" + PortugueseFemale = "pt_female" + PortugueseMale = "pt_male" + + @dataclass class Segment: + """Time segment representing a portion of audio. + + Indicates where speech was detected in the audio file. + + Attributes: + start: Start time of the segment in seconds. + end: End time of the segment in seconds. + """ + start: float end: float @dataclass class TranscriptionMetadata: - """Metadata for transcription results.""" + """Metadata about the transcribed audio file and transcription process. + + Contains information about the audio file characteristics and transcription results. + + Attributes: + file_duration: Total duration of the audio file in seconds. + language: Detected or specified language code (e.g., 'en', 'es', 'fr'). + sample_rate: Sample rate of the audio file in Hz (e.g., 16000, 44100). 
+ num_channels: Number of audio channels (1 for mono, 2 for stereo). + timestamp_utc: ISO 8601 timestamp when transcription was processed. + segments_count: Number of speech segments detected in the audio. + total_speech_duration: Total duration of detected speech in seconds + (excludes silence). + """ file_duration: float | None = None language: str | None = None @@ -82,7 +178,20 @@ def from_dict(cls, data: dict) -> TranscriptionMetadata: @dataclass class TranscriptionResponse: - """Response from file transcription API.""" + """Response from the file transcription API. + + Contains the complete transcription results including processed text, + time segments, and metadata. + + Attributes: + transcript: The complete transcription text with formatting and punctuation. + This is the processed, production-ready transcript. + raw_transcript: The raw transcription text without post-processing. + Useful for debugging or custom post-processing pipelines. + segments: List of time segments indicating where speech was detected. + Useful for creating captions or navigating through the audio. + metadata: Metadata about the transcription and source audio file. + """ transcript: str raw_transcript: str @@ -108,14 +217,24 @@ def from_dict(cls, data: dict) -> TranscriptionResponse: @dataclass class StructuredResponse: - """Response from structured API.""" + """Response from the structured data extraction API. + + Attributes: + results: Dictionary containing extracted structured data. The structure + depends on the form/schema configuration used. + """ results: dict[str, Any] @dataclass class SessionCloseResponse: - """Response from session close API.""" + """Response from the session close API. + + Attributes: + status: Status of the session closure operation. + deleted_at: ISO 8601 timestamp when the session was deleted.
+ """ status: str deleted_at: str @@ -123,7 +242,14 @@ class SessionCloseResponse: @dataclass class GrantTokenResponse: - """Response from grant token API.""" + """Response from the access token generation API. + + Attributes: + access_token: JWT access token for API authentication. This token has + an expiration time and should be validated before use. + session_id: Unique session identifier. Can be used to track and close + sessions. + """ access_token: str session_id: str @@ -131,24 +257,104 @@ class GrantTokenResponse: @dataclass class TranslationPayload: + """Configuration for translation task. + + Attributes: + src_lang_code: Source language code (e.g., 'en', 'es', 'fr'). + dst_lang_code: Destination language code (e.g., 'es', 'fr', 'de'). + """ + src_lang_code: str dst_lang_code: str @dataclass class TasksConfig: + """Configuration for AI tasks to run during transcription. + + Specify which AI-powered analysis tasks should be applied to the audio. + Each task can have its own configuration payload. + + Attributes: + TRANSLATION: Optional translation configuration. If provided, translates + transcribed text from source to destination language. + + Examples: + >>> config = TasksConfig( + ... TRANSLATION=TranslationPayload( + ... src_lang_code='en', + ... dst_lang_code='es' + ... ) + ... ) + """ + TRANSLATION: TranslationPayload | None = None @dataclass class VadConfig: + """Voice Activity Detection (VAD) configuration. + + Controls how the system detects speech and silence in audio streams, + affecting when transcription events are emitted and how audio is segmented. + + Attributes: + threshold: Probability threshold for speech detection (0.0 to 1.0). + Higher values make detection more conservative (less likely to + detect speech). Default is typically around 0.5. + min_speech_ms: Minimum duration of speech in milliseconds to trigger + detection. Speech shorter than this will be ignored, reducing + false positives. Default is typically 250ms. 
+ min_silence_ms: Minimum duration of silence in milliseconds to split + speech segments. Pauses shorter than this won't split the segment. + Default is typically 500ms. + max_segment_ms: Maximum duration of a speech segment in milliseconds. + Prevents extremely long segments. Default is typically 30000ms + (30 seconds). + + Examples: + >>> # More conservative detection with longer segments + >>> vad = VadConfig( + ... threshold=0.6, + ... min_speech_ms=300, + ... min_silence_ms=700, + ... max_segment_ms=15000 + ... ) + """ + threshold: float | None = None min_speech_ms: float | None = None min_silence_ms: float | None = None max_segment_ms: float | None = None +"""Type alias for file content that can be uploaded.""" FileContent = Union[IO[bytes], bytes, str] + +"""Type alias for file input supporting various formats. + +Supports: +- FileContent: Direct file object, bytes, or file path string +- (filename, FileContent): Tuple with optional filename and content +- (filename, FileContent, content_type): Tuple with filename, content, and MIME type +- (filename, FileContent, content_type, headers): Complete tuple with all metadata + +Examples: + >>> # File path (string) + >>> file = 'audio.wav' + + >>> # File object + >>> file = open('audio.wav', 'rb') + + >>> # Bytes + >>> file = audio_bytes + + >>> # With filename + >>> file = ('myfile.wav', audio_bytes) + + >>> # With content type + >>> file = ('myfile.wav', audio_bytes, 'audio/wav') +""" File = Union[ # file (or bytes) FileContent,