From 2e94820164e7955b86fe574d0cb6573b0ad41f33 Mon Sep 17 00:00:00 2001
From: Mikolaj Wojciech Gorski <mikolajg@opencodebox.com>
Date: Sat, 26 Jul 2025 14:26:18 +0200
Subject: [PATCH] Updated the docs to focus on a local only stack instead of
 one reliant on services like OpenAI, ElevenLabs and so on.

---
 README.md                       |  52 +++++-----
 docs/README.md                  |  72 +++++++-------
 docs/ai-architecture.md         | 126 ++++++++++++------------
 docs/commands.md                |  25 +++--
 docs/docker-compose-examples.md | 167 ++++++++++++++++++++++++++++++++
 docs/setup.md                   | 142 +++++++++++++++++++--------
 docs/troubleshooting.md         |  81 +++++++++++++++-
 7 files changed, 489 insertions(+), 176 deletions(-)
 create mode 100644 docs/docker-compose-examples.md
diff --git a/README.md b/README.md
index 4cdfb55..3ff33ef 100644
--- a/README.md
+++ b/README.md
@@ -35,14 +35,18 @@ Kasane Teto is your server's AI companion who can:
 
 ## 🚀 Quick Start
 
+> [!IMPORTANT]
+> This project is designed to run exclusively within Docker containers. Bare-metal installation is not officially supported. All instructions assume a working Docker environment.
+
 1. **Setup Environment**
    ```bash
    git clone <repository-url>
    cd discord_teto
    
-   # Configure AI and Discord credentials
+   # Configure Discord credentials & local AI endpoints
    export USER_TOKEN="your_discord_token"
-   export OPENAI_API_KEY="your_openai_key"  # or other AI provider
+   export VLLM_ENDPOINT="http://localhost:8000/v1" # Or your vLLM server
+   export WYOMING_HOST="localhost" WYOMING_PORT="10300" # Or your Wyoming server
    ```
 
 2. **Start Teto**
@@ -106,10 +110,11 @@ src/
 ```
 
 ### AI Integration
-- **Language Model**: GPT-4/Claude/Local LLM for conversation
-- **Vision Model**: CLIP/GPT-4V for image understanding
-- **Voice Synthesis**: Eleven Labs/Azure Speech for Teto's voice
-- **Memory System**: Vector database for conversation history
+- **Language Model**: Self-hosted LLM via `vLLM` (OpenAI compatible endpoint)
+- **Vision Model**: Multi-modal models served through `vLLM`
+- **Voice Synthesis**: `Piper` TTS via `Wyoming` protocol
+- **Speech Recognition**: `Whisper` STT via `Wyoming` protocol
+- **Memory System**: Local vector database for conversation history
 - **Personality Engine**: Custom prompt engineering for character consistency
 
 ## 🎭 Teto's Personality
@@ -157,21 +162,19 @@ src/
 
 ## 🔧 Configuration
 
-### AI Provider Setup
+### Local AI Provider Setup
 ```env
-# OpenAI (recommended)
-OPENAI_API_KEY=your_openai_key
-OPENAI_MODEL=gpt-4-turbo-preview
+# Local vLLM Server (OpenAI Compatible)
+VLLM_ENDPOINT="http://localhost:8000/v1"
+LOCAL_MODEL_NAME="mistralai/Mistral-7B-Instruct-v0.2" # Or your preferred model
 
-# Alternative: Anthropic Claude
-ANTHROPIC_API_KEY=your_claude_key
+# Wyoming Protocol for Voice (Piper TTS / Whisper STT)
+WYOMING_HOST="localhost"
+WYOMING_PORT="10300"
+PIPER_VOICE="en_US-lessac-medium"
 
-# Voice Synthesis
-ELEVENLABS_API_KEY=your_elevenlabs_key
-TETO_VOICE_ID=kasane_teto_voice_clone
-
-# Vision Capabilities  
-VISION_MODEL=gpt-4-vision-preview
+# Vision Capabilities are enabled if the vLLM model is multi-modal
+VISION_ENABLED=true
 ```
 
 ### Personality Customization
@@ -196,6 +199,8 @@ export const TETO_PERSONALITY = {
 
 ## 🐳 Docker Deployment
 
+This project is officially supported for **Docker deployments only**. The container-first approach is critical for managing the complex local AI stack, ensuring that all services, dependencies, and configurations operate together consistently.
+
 ### Production Setup
 ```bash
 # Start Teto with all AI capabilities
@@ -206,10 +211,11 @@ docker compose logs -f teto_ai
 ```
 
 ### Resource Requirements
-- **Memory**: 4GB+ recommended for AI processing
-- **CPU**: Multi-core for real-time AI inference
-- **Storage**: SSD recommended for fast model loading
-- **Network**: Stable connection for AI API calls
+- **VRAM**: 8GB+ for 7B models, 24GB+ for larger models
+- **Memory**: 16GB+ RAM recommended
+- **CPU**: Modern multi-core CPU
+- **Storage**: Fast SSD for model weights (15GB+ per model)
+- **Network**: Local network for inter-service communication
 
 ## 🔐 Privacy & Ethics
 
@@ -292,7 +298,7 @@ This project is for educational and community use. Please ensure compliance with
 ---
 
 **Version**: 3.0.0 (AI-Powered)  
-**AI Models**: GPT-4, CLIP, Eleven Labs  
+**AI Stack**: Local-First (vLLM, Piper, Whisper)  
 **Runtime**: Node.js 20+ with Docker  
 
 Bring Kasane Teto to life in your Discord server! 🎵✨
diff --git a/docs/README.md b/docs/README.md
index be4f04f..86d6c75 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -17,9 +17,9 @@ Unlike simple command bots, Teto engages in genuine conversations, remembers pas
 ## 📚 Documentation Structure
 
 ### 🚀 Getting Started
-- **[Setup Guide](setup.md)** - Complete installation and AI configuration
+- **[Setup Guide](setup.md)** - Complete installation and local AI stack configuration
 - **[Quick Start](../README.md#quick-start)** - Get Teto running in 5 minutes
-- **[Configuration](configuration.md)** - AI models, personality, and customization
+- **[Configuration](configuration.md)** - Local models, personality, and customization
 
 ### 💬 Interacting with Teto
 - **[Conversation Guide](interactions.md)** - How to chat naturally with Teto
@@ -28,10 +28,10 @@ Unlike simple command bots, Teto engages in genuine conversations, remembers pas
 - **[Voice Interaction](voice.md)** - Speaking with Teto in voice channels
 
 ### 🧠 AI Capabilities
-- **[AI Architecture](ai-architecture.md)** - How Teto's AI systems work
-- **[Vision System](vision.md)** - Image analysis and visual understanding
-- **[Memory System](memory.md)** - How Teto remembers conversations
-- **[Personality Engine](personality-engine.md)** - Character consistency and roleplay
+- **[AI Architecture](ai-architecture.md)** - How Teto's local AI systems work
+- **[Vision System](vision.md)** - Image analysis with local multi-modal models
+- **[Memory System](memory.md)** - How Teto remembers conversations locally
+- **[Personality Engine](personality-engine.md)** - Character consistency and roleplay
 
 ### 🔧 Technical Documentation
 - **[Architecture Overview](architecture.md)** - System design and components
@@ -41,15 +41,15 @@ Unlike simple command bots, Teto engages in genuine conversations, remembers pas
 
 ### 🛠️ Operations & Support
 - **[Troubleshooting](troubleshooting.md)** - Common issues and solutions
-- **[Performance Tuning](performance.md)** - Optimization for your server
-- **[Security & Privacy](security.md)** - Data handling and safety considerations
+- **[Performance Tuning](performance.md)** - Optimizing your local AI stack
+- **[Security & Privacy](security.md)** - Data handling and safety in a local-first setup
 
 ## 🎯 Quick Navigation by Use Case
 
 ### "I want to set up Teto for the first time"
-1. [Setup Guide](setup.md) - Installation and configuration
-2. [Configuration](configuration.md) - AI API keys and personality setup
-3. [Docker Guide](docker.md) - Container deployment
+1. [Setup Guide](setup.md) - Installation and local AI stack configuration
+2. [Configuration](configuration.md) - vLLM, Piper, and Whisper setup
+3. [Docker Guide](docker.md) - Multi-container deployment for AI services
 
 ### "I want to understand how to interact with Teto"
 1. [Conversation Guide](interactions.md) - Natural chat examples
@@ -58,7 +58,7 @@ Unlike simple command bots, Teto engages in genuine conversations, remembers pas
 
 ### "I want to understand Teto's capabilities"
 1. [Personality Guide](personality.md) - Character traits and style
-2. [Vision System](vision.md) - Image and video analysis
+2. [Vision System](vision.md) - Image analysis with local models
 3. [AI Architecture](ai-architecture.md) - Technical capabilities
 
 ### "I want to customize or develop features"
@@ -68,8 +68,8 @@ Unlike simple command bots, Teto engages in genuine conversations, remembers pas
 
 ### "I'm having issues or want to optimize"
 1. [Troubleshooting](troubleshooting.md) - Problem solving
-2. [Performance Tuning](performance.md) - Optimization tips
-3. [Security & Privacy](security.md) - Best practices
+2. [Performance Tuning](performance.md) - Optimizing your local AI stack
+3. [Security & Privacy](security.md) - Best practices for a local-first setup
 
 ## 🌟 Key Features Overview
 
@@ -94,11 +94,12 @@ Carefully crafted personality engine ensures Teto maintains consistent character
 ## 🔧 Technical Architecture
 
 ```
-Teto AI System
-├── Language Model (GPT-4/Claude)    # Natural conversation
-├── Vision Model (GPT-4V/CLIP)       # Image/video analysis  
-├── Voice Synthesis (ElevenLabs)     # Speech generation
-├── Memory System (Vector DB)        # Conversation history
+Teto Local AI System
+├── Language Model (vLLM)            # Self-hosted natural conversation
+├── Vision Model (vLLM Multi-modal)  # Self-hosted image/video analysis  
+├── Voice Synthesis (Piper TTS)      # Local speech generation via Wyoming
+├── Speech Recognition (Whisper STT) # Local speech recognition via Wyoming
+├── Memory System (Local Vector DB)  # Local conversation history
 ├── Personality Engine               # Character consistency
 └── Discord Integration              # Platform interface
 ```
@@ -106,23 +107,24 @@ Teto AI System
 ## 📋 System Requirements
 
 ### Minimum Requirements
-- **RAM**: 4GB (AI model loading)
-- **CPU**: Multi-core (real-time inference)
-- **Storage**: 10GB (models and data)
-- **Network**: Stable connection (AI API calls)
+- **VRAM**: 8GB+ for 7B models (required for `vLLM`)
+- **RAM**: 16GB+ (for models and system)
+- **CPU**: Modern multi-core (for processing)
+- **Storage**: 15GB+ SSD (for model weights)
+- **Network**: Local network for inter-service communication
 
 ### Recommended Setup
-- **RAM**: 8GB+ for optimal performance
-- **CPU**: Modern multi-core processor
-- **Storage**: SSD for fast model access
-- **GPU**: Optional but beneficial for local inference
+- **VRAM**: 24GB+ for larger models or concurrent tasks
+- **RAM**: 32GB+ for smoother operation
+- **Storage**: NVMe SSD for fast model loading
+- **GPU**: Required for `vLLM` and `Whisper`
 
 ## 🚦 Getting Started Checklist
 
 - [ ] Read the [Setup Guide](setup.md)
-- [ ] Obtain necessary API keys (OpenAI, ElevenLabs, etc.)
-- [ ] Configure Discord token and permissions
-- [ ] Deploy using Docker or run locally
+- [ ] Download required model weights (LLM, TTS, etc.)
+- [ ] Configure local endpoints for `vLLM` and `Wyoming`
+- [ ] Deploy multi-container stack using Docker
 - [ ] Customize personality settings
 - [ ] Test basic conversation features
 - [ ] Explore voice and vision capabilities
@@ -143,12 +145,12 @@ See the [Development Guide](development.md) for detailed contribution guidelines
 - **Technical Issues**: Check [Troubleshooting](troubleshooting.md)
 - **Setup Problems**: Review [Setup Guide](setup.md)
 - **Feature Questions**: See [Commands Reference](commands.md)
-- **AI Behavior**: Read [Personality Guide](personality.md)
+- **AI Behavior**: Read [Personality Guide](personality.md)
 
 ### Best Practices
-- **Privacy First**: Always respect user consent and data privacy
+- **Privacy First**: All data is processed locally, ensuring maximum privacy
 - **Appropriate Content**: Maintain family-friendly interactions
-- **Resource Management**: Monitor AI API usage and costs
+- **Resource Management**: Monitor local GPU and CPU usage
 - **Community Guidelines**: Foster positive server environments
 
 ## 📊 Documentation Stats
@@ -163,10 +165,10 @@ See the [Development Guide](development.md) for detailed contribution guidelines
 
 The documentation will continue to evolve with new features:
 - **Advanced Memory Systems** - Long-term relationship building
-- **Custom Voice Training** - Personalized Teto voice models  
+- **Custom Voice Training** - Fine-tuning `Piper` for a unique Teto voice
 - **Multi-Server Consistency** - Shared personality across servers
 - **Game Integration** - Interactive gaming experiences
-- **Creative Tools** - Music and art generation capabilities
+- **Creative Tools** - Music and art generation with local models
 
 ---
 
diff --git a/docs/ai-architecture.md b/docs/ai-architecture.md
index cb210e9..60a9d2b 100644
--- a/docs/ai-architecture.md
+++ b/docs/ai-architecture.md
@@ -26,34 +26,34 @@ This document provides a comprehensive overview of how Kasane Teto's AI systems
 ### Core Components
 
 **1. AI Orchestration Layer**
-- Coordinates between different AI services
+- Coordinates between different local AI services
 - Manages context flow and decision routing
 - Handles multi-modal input integration
 - Ensures personality consistency across modalities
 
-**2. Language Model Integration**
-- Primary conversational intelligence (GPT-4/Claude)
-- Context-aware response generation
-- Personality-guided prompt engineering
+**2. Language Model Integration (vLLM)**
+- Self-hosted conversational intelligence via `vLLM`
+- Context-aware response generation through OpenAI-compatible API
+- Personality-guided prompt engineering for local models
 - Multi-turn conversation management
 
-**3. Vision Processing System**
-- Image analysis and understanding
+**3. Vision Processing System (vLLM Multi-modal)**
+- Image analysis using local multi-modal models
 - Video frame processing for streams
 - Visual context integration with conversations
 - Automated response generation for visual content
 
-**4. Voice Synthesis & Recognition**
-- Text-to-speech with Teto's voice characteristics
-- Speech-to-text for voice command processing
-- Emotional tone and inflection control
+**4. Voice Synthesis & Recognition (Wyoming Protocol)**
+- Text-to-speech using `Piper` for Teto's voice characteristics
+- Speech-to-text using `Whisper` for voice command processing
+- Emotional tone and inflection control via TTS models
 - Real-time voice conversation capabilities
 
-**5. Memory & Context System**
-- Long-term conversation history storage
+**5. Memory & Context System (Local)**
+- Local long-term conversation history storage (e.g., ChromaDB)
 - User preference and relationship tracking
 - Context retrieval for relevant conversations
-- Semantic search across past interactions
+- Local semantic search across past interactions
 
 **6. Personality Engine**
 - Character consistency enforcement
@@ -138,24 +138,25 @@ Image Upload → Image Processing → Vision Model → Context Integration → R
 ### Voice Interaction Flow
 
 ```
-Voice Channel Join → Audio Processing → Speech Recognition → Text Processing → Voice Synthesis → Audio Output
-                           ↓                  ↓                    ↓               ↓
-                    Noise Filtering → Intent Detection → LLM Response → Voice Cloning
+Voice Channel Join → Audio Processing (Whisper) → Text Processing (vLLM) → Voice Synthesis (Piper) → Audio Output
+                           ↓                        ↓                        ↓
+                    Noise Filtering →         Intent Detection →      LLM Response →        Voice Model
 ```
 
 ## 🧩 AI Service Integration
 
-### Language Model Configuration
+### Language Model Configuration (vLLM)
 
-**Primary Model: GPT-4 Turbo**
+**vLLM with OpenAI-Compatible Endpoint:**
 ```javascript
-const LLM_CONFIG = {
-  model: "gpt-4-turbo-preview",
-  temperature: 0.8,        // Creative but consistent
-  max_tokens: 1000,        // Reasonable response length
-  top_p: 0.9,             // Focused but diverse
-  frequency_penalty: 0.3,  // Reduce repetition
-  presence_penalty: 0.2    // Encourage topic exploration
+const VLLM_CONFIG = {
+  endpoint: "http://localhost:8000/v1", // Your vLLM server
+  model: "mistralai/Mistral-7B-Instruct-v0.2", // Or your preferred model
+  temperature: 0.7,        // Creative yet grounded
+  max_tokens: 1500,        // Max response length
+  top_p: 0.9,             // Focused sampling
+  frequency_penalty: 0.2,  // Reduce repetition
+  presence_penalty: 0.1    // Encourage topic exploration
 };
 ```
 
@@ -166,45 +167,43 @@ USER: Conversation history + current message + visual context (if any)
 ASSISTANT: Previous Teto responses for consistency
 ```
 
-### Vision Model Integration
+### Vision Model Integration (vLLM Multi-modal)
 
 **Model Stack:**
-- **GPT-4 Vision** - Primary image understanding
-- **CLIP** - Image-text similarity for context matching
-- **Custom Fine-tuning** - Teto-specific visual preferences
+- **Local Multi-modal Model** - (e.g., LLaVA, Idefics) served via `vLLM`
+- **CLIP** - Local image-text similarity for context matching
+- **Custom Fine-tuning** - Potential for Teto-specific visual preferences
 
 **Processing Pipeline:**
 ```javascript
 const processImage = async (imageUrl, conversationContext) => {
-  // Multi-model analysis for comprehensive understanding
-  const gpt4Analysis = await analyzeWithGPT4V(imageUrl);
-  const clipEmbedding = await getCLIPEmbedding(imageUrl);
+  // Local multi-modal analysis
+  const localAnalysis = await analyzeWithVLLM(imageUrl);
+  const clipEmbedding = await getLocalCLIPEmbedding(imageUrl);
   const contextMatch = await findSimilarImages(clipEmbedding);
   
   return {
-    description: gpt4Analysis.description,
-    emotions: gpt4Analysis.emotions,
+    description: localAnalysis.description,
+    emotions: localAnalysis.emotions,
     relevantMemories: contextMatch,
-    responseStyle: determineResponseStyle(gpt4Analysis, conversationContext)
+    responseStyle: determineResponseStyle(localAnalysis, conversationContext)
   };
 };
 ```
 
-### Voice Synthesis Setup
+### Voice I/O Setup (Wyoming Protocol)
 
-**ElevenLabs Configuration:**
+**Piper TTS and Whisper STT via Wyoming:**
 ```javascript
-const VOICE_CONFIG = {
-  voice_id: "kasane_teto_voice_clone",
-  model_id: "eleven_multilingual_v2",
-  stability: 0.75,         // Consistent voice characteristics
-  similarity_boost: 0.8,   // Maintain Teto's voice signature
-  style: 0.6,             // Moderate emotional expression
-  use_speaker_boost: true  // Enhanced clarity
+const WYOMING_CONFIG = {
+  host: "localhost",
+  port: 10300,
+  piper_voice: "en_US-lessac-medium", // Or a custom-trained Teto voice
+  whisper_model: "base.en" // Or larger model depending on resources
 };
 ```
 
-### Memory System Architecture
+### Memory System Architecture (Local)
 
 **Vector Database Structure:**
 ```javascript
@@ -324,10 +323,10 @@ const safetyPipeline = async (content, context) => {
 ### Privacy Protection
 
 **Data Handling Principles:**
-- **Local Memory Storage** - Conversation history stored locally, not sent to external services
-- **Anonymized Analytics** - Usage patterns tracked without personal identifiers
-- **Selective Context** - Only relevant conversation context sent to AI models
-- **User Consent** - Clear communication about data usage and AI processing
+- **Complete Privacy** - All data, including conversations, images, and voice, is processed locally.
+- **No External Data Transfer** - AI processing does not require sending data to third-party services.
+- **Full User Control** - Users have complete control over their data and the AI models.
+- **User Consent** - Clear communication that all processing is done on the user's own hardware.
 
 ## 📊 Performance Optimization
 
@@ -385,21 +384,18 @@ const processMessageAsync = async (message) => {
 
 ### Resource Management
 
-**Model Loading Strategy:**
+**Model Loading Strategy (for vLLM):**
 ```javascript
-const MODEL_LOADING = {
-  // Keep language model always loaded
-  language_model: "persistent",
-  
-  // Load vision model on demand
-  vision_model: "on_demand",
-  
-  // Pre-load voice synthesis during voice channel activity
-  voice_synthesis: "predictive",
-  
-  // Cache embeddings for frequent users
-  user_embeddings: "lru_cache"
+// This is typically managed by the vLLM server instance itself.
+// The configuration would involve which models to load on startup.
+const VLLM_SERVER_ARGS = {
+  model: "mistralai/Mistral-7B-Instruct-v0.2",
+  "tensor-parallel-size": 1, // Or more depending on GPU count
+  "gpu-memory-utilization": 0.9, // Use 90% of GPU memory
+  "max-model-len": 4096,
 };
+
+// Wyoming services for Piper/Whisper are typically persistent.
 ```
 
 ## 🔧 Configuration & Customization
@@ -443,14 +439,14 @@ const TUNABLE_PARAMETERS = {
 const getModelConfig = (environment) => {
   const configs = {
     development: {
-      model: "gpt-3.5-turbo",
+      model: "local-dev-model/gguf", // Smaller model for dev
       response_time_target: 3000,
       logging_level: "debug",
       cache_enabled: false
     },
     
     production: {
-      model: "gpt-4-turbo-preview",
+      model: "mistralai/Mistral-7B-Instruct-v0.2",
       response_time_target: 1500,
       logging_level: "info",
       cache_enabled: true,
diff --git a/docs/commands.md b/docs/commands.md
index a49b083..5d172de 100644
--- a/docs/commands.md
+++ b/docs/commands.md
@@ -303,13 +303,12 @@ How long did this take you to create? I'm in awe! ✨"
 **Example Response**:
 ```
 🤖 **Teto Status Report**
-💭 AI Systems: All operational! 
-🎤 Voice: Ready to chat in voice channels
-👀 Vision: Image analysis active
-🧠 Memory: 1,247 conversations remembered
+💭 AI Systems: All local services operational!
+🚀 vLLM: `mistralai/Mistral-7B-Instruct-v0.2` (Online)
+🎤 Wyoming: Piper TTS & Whisper STT (Online)
+🧠 Memory: Local Vector DB (1,247 conversations)
 ✨ Mood: Cheerful and energetic!
 ⏰ Been active for 3 hours today
-🎵 Currently listening to: Lo-fi beats
 ```
 
 ---
@@ -441,16 +440,16 @@ how you finally managed it!"
 ## ⚠️ Important Notes
 
 ### Privacy & Consent
-- All interactions are processed through AI systems
-- Conversation history is stored locally for continuity
-- Visual content is analyzed but not permanently stored
-- Voice interactions may be temporarily cached for processing
+- All interactions are processed by your self-hosted AI stack. No data is sent to external third-party services.
+- Conversation history is stored in your local vector database.
+- Visual content is analyzed by your local multi-modal model and is not stored unless recorded.
+- Voice is processed locally via the Wyoming protocol (Piper/Whisper).
 
 ### Limitations
-- Response time varies with AI model load (typically 1-3 seconds)
-- Complex image analysis may take slightly longer
-- Voice synthesis has brief processing delay
-- Memory system focuses on significant interactions
+- Response time depends entirely on your local hardware (GPU, CPU, RAM).
+- The quality and capabilities of Teto depend on the models you choose to run.
+- Requires significant VRAM (8GB+ for basic models, 24GB+ for larger ones).
+- Initial setup and configuration of the local AI stack can be complex.
 
 ### Ethics & Safety
 - Teto is programmed to maintain appropriate, family-friendly interactions
diff --git a/docs/docker-compose-examples.md b/docs/docker-compose-examples.md
new file mode 100644
index 0000000..21590d3
--- /dev/null
+++ b/docs/docker-compose-examples.md
@@ -0,0 +1,167 @@
+# Docker Compose Examples for Local AI Stack
+
+This document provides production-ready `docker-compose.yml` examples for setting up the self-hosted AI services required by the Teto AI Companion bot. These services should be included in the same `docker-compose.yml` file as the `teto_ai` bot service itself to ensure proper network communication.
+
+> [!IMPORTANT]
+> These examples require a host machine with an NVIDIA GPU and properly installed drivers. They use CDI (Container Device Interface) for GPU reservations, which is the modern standard for Docker.
+
+## 🤖 vLLM Service (Language & Vision Model)
+
+This service uses `vLLM` to serve a powerful language model with an OpenAI-compatible API endpoint. This allows Teto to perform natural language understanding and generation locally. If you use a multi-modal model, this service will also provide vision capabilities.
+
+```yaml
+services:
+  vllm-openai:
+    # This section reserves GPU resources for the container.
+    # It gives vLLM access to all NVIDIA GPUs on the host (access is shared, not exclusive).
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: cdi
+              device_ids: ['nvidia.com/gpu=all']
+              capabilities: ['gpu']
+    # Mount local directories for model weights and cache.
+    # This prevents re-downloading models on every container restart.
+    volumes:
+      - /path/to/your/llm_models/hf_cache:/root/.cache/huggingface
+      - /path/to/your/llm_models:/root/LLM_models
+    # Map the container's port 8000 to a host port (e.g., 11434).
+    # Your .env file should point to this host port.
+    ports:
+      - "11434:8000"
+    environment:
+      # (Optional) Add your Hugging Face token if needed for private models.
+      - HUGGING_FACE_HUB_TOKEN=your_hf_token_here
+      # Optimizes PyTorch memory allocation, can improve performance.
+      - PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512,garbage_collection_threshold:0.8
+    # Necessary for multi-GPU communication and performance.
+    ipc: host
+    image: vllm/vllm-openai:latest
+    # --- vLLM Command Line Arguments ---
+    # These arguments configure how vLLM serves the model.
+    # Adjust them based on your model and hardware.
+    command:
+      - --model=jeffcookio/Mistral-Small-3.2-24B-Instruct-2506-awq-sym
+      - --tensor-parallel-size=2          # Number of GPUs to use.
+      - --max-model-len=32256             # Maximum context length.
+      - --limit-mm-per-prompt=image=4     # For multi-modal models.
+      - --enable-auto-tool-choice         # For models that support tool use.
+      - --tool-call-parser=mistral
+      - --enable-chunked-prefill
+      - --disable-log-stats
+      - --gpu-memory-utilization=0.75     # Use 75% of GPU VRAM.
+      - --enable-prefix-caching
+      - --max-num-seqs=4                  # Max concurrent sequences.
+      - --served-model-name=Mistral-Small-3.2
+```
+
+### vLLM Configuration Notes
+-   **`--model`**: Specify the Hugging Face model identifier you want to serve.
+-   **`--tensor-parallel-size`**: Set this to the number of GPUs you want to use for a single model. For a single GPU, this should be `1`.
+-   **`--gpu-memory-utilization`**: Adjust this value based on your VRAM. `0.75` (75%) is a safe starting point.
+-   Check the [official vLLM documentation](https://docs.vllm.ai/en/latest/) for the latest command-line arguments and supported models.
+
+## 🎤 Wyoming Voice Services (Piper TTS & Whisper STT)
+
+These services provide Text-to-Speech (`Piper`) and Speech-to-Text (`Whisper`) capabilities over the `Wyoming` protocol. They run as separate containers but are managed within the same Docker Compose file.
+
+```yaml
+services:
+  # --- Whisper STT Service ---
+  # Converts speech from the voice channel into text for Teto to understand.
+  wyoming-whisper:
+    image: slackr31337/wyoming-whisper-gpu:latest
+    container_name: wyoming-whisper
+    environment:
+      # Configure the Whisper model size and language.
+      # Smaller models are faster but less accurate.
+      - MODEL=base-int8
+      - LANGUAGE=en
+      - COMPUTE_TYPE=int8
+      - BEAM_SIZE=5
+    ports:
+      # Exposes the Wyoming protocol port for Whisper.
+      - "10300:10300"
+    volumes:
+      # Mount a volume to persist Whisper model data.
+      - /path/to/your/whisper_data:/data
+    restart: unless-stopped
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: cdi
+              device_ids: ['nvidia.com/gpu=all']
+              capabilities: ['gpu']
+
+  # --- Piper TTS Service ---
+  # Converts Teto's text responses into speech.
+  wyoming-piper:
+    image: slackr31337/wyoming-piper-gpu:latest
+    container_name: wyoming-piper
+    environment:
+      # Specify which Piper voice model to use.
+      - PIPER_VOICE=en_US-amy-medium
+    ports:
+      # Exposes the Wyoming protocol port for Piper.
+      - "10200:10200"
+    volumes:
+      # Mount a volume to persist Piper voice models.
+      - /path/to/your/piper_data:/data
+    restart: unless-stopped
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: cdi
+              device_ids: ['nvidia.com/gpu=all']
+              capabilities: ['gpu']
+```
+
+### Wyoming Configuration Notes
+-   **Multiple Ports**: Note that `Whisper` and `Piper` listen on different ports (`10300` and `10200` in this example). Your bot's configuration will need to point to the correct service and port.
+-   **Voice Models**: You can download different `Piper` voice models and place them in your persistent data directory to change Teto's voice.
+-   **GPU Usage**: These images are for GPU-accelerated voice processing. If your GPU is dedicated to `vLLM`, you may consider using CPU-based images for Wyoming to conserve VRAM.
+
+## 🌐 Networking
+
+For the services to communicate with each other, they must share a Docker network. Using an external network is a good practice for managing complex applications.
+
+```yaml
+# Add this to the bottom of your docker-compose.yml file
+networks:
+  backend:
+    external: true
+```
+
+Before starting your stack, create the network manually:
+```bash
+docker network create backend
+```
+
+Then, ensure each service in your `docker-compose.yml` (including the `teto_ai` bot) is attached to this network:
+
+```yaml
+services:
+  teto_ai:
+    # ... your bot's configuration
+    networks:
+      - backend
+
+  vllm-openai:
+    # ... vllm configuration
+    networks:
+      - backend
+
+  wyoming-whisper:
+    # ... whisper configuration
+    networks:
+      - backend
+
+  wyoming-piper:
+    # ... piper configuration
+    networks:
+      - backend
+```
+This allows the Teto bot to communicate with `vllm-openai`, `wyoming-whisper`, and `wyoming-piper` using their service names as hostnames.
\ No newline at end of file
diff --git a/docs/setup.md b/docs/setup.md
index 6904775..d3c0f3c 100644
--- a/docs/setup.md
+++ b/docs/setup.md
@@ -5,16 +5,22 @@ This guide will walk you through setting up the Discord Teto Bot for video recor
 ## 📋 Prerequisites
 
 ### System Requirements
-- **Operating System**: Linux, macOS, or Windows with WSL2
-- **Docker**: Version 20.10+ and Docker Compose v2+
-- **Disk Space**: Minimum 2GB for container, additional space for recordings
-- **Memory**: 4GB RAM recommended (2GB minimum)
-- **Network**: Stable internet connection for Discord API
+- **Operating System**: Linux is strongly recommended for GPU support. Windows with WSL2 is possible.
+- **GPU**: NVIDIA GPU with 8GB+ VRAM is required for local model hosting.
+- **Docker**: Version 20.10+ and Docker Compose v2+.
+- **Disk Space**: 20GB+ SSD for models and container images.
+- **Memory**: 16GB+ RAM recommended.
+- **Network**: Local network for inter-service communication.
 
 ### Discord Requirements
-- Discord account with user token
-- Server permissions to join voice channels
-- Voice channel access where you want to record
+- Discord account with user token.
+- Server permissions to join voice channels.
+- Voice channel access where you want to record.
+
+### Local AI Requirements
+- **LLM/VLM Model**: A downloaded language model compatible with `vLLM` (e.g., from Hugging Face).
+- **TTS Voice Model**: A downloaded `Piper` voice model.
+- **STT Model**: A downloaded `Whisper` model.
 
 ### Development Prerequisites (Optional)
 - **Node.js**: Version 20+ for local development
@@ -32,14 +38,20 @@ cd discord_teto
 
 ### Step 2: Environment Configuration
 
-Create environment variables for your Discord token:
+Create environment variables for your Discord token and local AI endpoints:
 
 ```bash
 # Method 1: Export in terminal session
 export USER_TOKEN="your_discord_user_token_here"
+export VLLM_ENDPOINT="http://localhost:8000/v1"
+export WYOMING_HOST="localhost"
+export WYOMING_PORT="10300"
 
 # Method 2: Create .env file (recommended)
 echo "USER_TOKEN=your_discord_user_token_here" > .env
+echo "VLLM_ENDPOINT=http://localhost:8000/v1" >> .env
+echo "WYOMING_HOST=localhost" >> .env
+echo "WYOMING_PORT=10300" >> .env
 ```
 
 **Getting Your Discord Token:**
@@ -50,24 +62,38 @@ echo "USER_TOKEN=your_discord_user_token_here" > .env
 5. Look for requests to `discord.com/api`
 6. Find Authorization header starting with your token
 
-⚠️ **Security Warning**: Never share your Discord token publicly or commit it to version control.
+⚠️ **Security Warning**: Never share your Discord token publicly or commit it to version control. The bot logs in with your user token, so it acts with the full permissions of your Discord account.
 
-### Step 3: Directory Setup
+### Step 3: Model & Directory Setup
 
-Create the output directory for recordings:
+1. **Create Directories**
+   Create directories for recordings and for your AI models.
+   ```bash
+   mkdir -p output models/piper models/whisper models/llm
+   chmod 755 output models
+   ```
+   This `models` directory will be mounted into your AI service containers.
 
-```bash
-mkdir -p output
-chmod 755 output
-```
+2. **Download AI Models**
+   - **Language Model**: Download your chosen `vLLM`-compatible model (typically Hugging Face-format weights; GGUF support in `vLLM` is limited) and place it in `models/llm`.
+   - **Voice Model (Piper)**: Download a `.onnx` and `.json` voice file for Piper and place them in `models/piper`.
+   - **Speech-to-Text Model (Whisper)**: The Whisper service will download its model on first run, or you can pre-download it.
 
-This directory will be mounted into the Docker container to persist recordings.
+The `output` directory will be mounted into the bot container to persist recordings, and the `models` directory will be mounted into the AI service containers to provide their models.
 
-### Step 4: Docker Container Setup
+### Step 4: Local AI Stack & Bot Setup
+
+This project uses a multi-container Docker setup for the bot and its local AI services. Your `docker-compose.yml` file should define services for:
+- `teto_ai`: The bot itself.
+- `vllm-openai`: The language model server, providing an OpenAI-compatible endpoint.
+- `wyoming-piper`: The Text-to-Speech (TTS) service.
+- `wyoming-whisper`: The Speech-to-Text (STT) service.
+
+Sanitized example configurations for these services, along with full configuration details and explanations, are provided in the [Docker Compose Examples](docker-compose-examples.md) guide.
 
 #### Production Setup
 ```bash
-# Build and start the container
+# Build and start all containers
 docker compose up --build
 
 # Or run in background
@@ -110,16 +136,19 @@ docker compose -f docker-compose.dev.yml up --build --no-deps
 
 ### Environment Variables
 
-Create a `.env` file in the project root:
+Create a `.env` file in the project root to configure the bot and its connections to the local AI services:
 
 ```env
-# Required
+# Required: Discord Token
 USER_TOKEN=your_discord_user_token
 
-# Optional
-BOT_CLIENT_ID=your_bot_application_id
-BOT_CLIENT_SECRET=your_bot_secret
-BOT_REDIRECT_URI=https://your-domain.com/auth/callback
+# Required: Local AI Service Endpoints
+VLLM_ENDPOINT="http://vllm:8000/v1" # Using Docker service name
+VLLM_MODEL="mistralai/Mistral-7B-Instruct-v0.2" # Model served by vLLM
+
+WYOMING_HOST="wyoming" # Using Docker service name
+WYOMING_PORT="10300"
+PIPER_VOICE="en_US-lessac-medium" # Voice model for Piper TTS
 
 # Recording Settings (optional)
 RECORDING_TIMEOUT=30000
@@ -176,17 +205,14 @@ export const VIDEO_CONFIG = {
 
 ## 🔒 Security Considerations
 
-### Token Security
-- Store tokens in environment variables, never in code
-- Use `.env` files for local development (add to `.gitignore`)
-- Consider using Docker secrets for production deployments
-- Rotate tokens regularly
+### Data Privacy & Security
+- **100% Local AI Processing**: All AI processing, including conversations, voice, and images, happens locally. No data is sent to external third-party AI services; only normal Discord traffic leaves your machine.
+- **Token Security**: Your Discord token should still be kept secure in a `.env` file or Docker secrets. Never commit it to version control.
+- **Network Isolation**: The AI services (`vLLM`, `Wyoming`) can be configured to only be accessible within the Docker network, preventing outside access.
 
 ### Container Security
-- Bot runs as non-root user inside container
-- Limited system capabilities (only SYS_ADMIN for Discord GUI)
-- Isolated filesystem with specific volume mounts
-- No network access beyond Discord API requirements
+- The bot runs as a non-root user inside its container; configure the AI service containers to run as non-root users as well where their images support it.
+- Filesystem access is limited via specific volume mounts for models and output.
 
 ### File Permissions
 ```bash
@@ -200,6 +226,36 @@ chmod 644 ./output/*.mkv  # For recorded files
 
 ## 🐛 Troubleshooting Setup Issues
 
+### Local AI Service Issues
+
+**1. vLLM Container Fails to Start**
+```bash
+# Check vLLM logs for errors
+docker compose logs vllm
+
+# Common issues:
+# - Insufficient GPU VRAM for the selected model.
+# - Incorrect model path or name.
+# - CUDA driver issues on the host machine.
+# - Stale vLLM image (run `docker compose pull` to fetch the latest image).
+```
+
+**2. Wyoming Service Not Responding**
+```bash
+# Check Wyoming protocol server logs
+docker compose logs wyoming
+
+# Common issues:
+# - Incorrect path to Piper voice models.
+# - Port conflicts on the host (port 10300).
+# - Whisper model download failure on first run.
+```
+
+**3. Teto Bot Can't Connect to AI Services**
+- Verify service names in your `.env` file match the service names in `docker-compose.yml` (e.g., `http://vllm:8000/v1`).
+- Ensure all containers are on the same Docker network.
+- Use `docker compose ps` to see if all containers are running and healthy.
+
 ### Common Installation Problems
 
 **1. Docker not found**
@@ -273,14 +329,22 @@ npm install
 
 ### Container Health
 ```bash
-# Check container status
+# Check status of all containers (bot, vllm, wyoming)
 docker compose ps
 
-# View resource usage
-docker stats teto_ai
+# View resource usage for all services
+docker stats
 
-# Monitor logs in real-time
-docker compose logs -f
+# Monitor logs for a specific service in real-time
+docker compose logs -f vllm
+docker compose logs -f wyoming
+docker compose logs -f teto_ai
+```
+
+### GPU Resource Monitoring
+```bash
+# Monitor GPU VRAM and utilization on the host machine
+watch -n 1 nvidia-smi
 ```
 
 ### Recording Status
diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md
index 654a3b7..3be5e1f 100644
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -28,7 +28,86 @@ docker inspect teto_ai | grep -A 5 "Mounts"
 df -h ./output/
 ```
 
-## 🐳 Docker Issues
+## 🤖 Local AI Stack Issues
+
+### vLLM Service Issues
+
+**Problem**: The `vllm` container fails to start, crashes, or doesn't respond to requests.
+
+**Diagnosis**:
+```bash
+# Check the vLLM container logs for CUDA errors, model loading issues, etc.
+docker compose logs vllm
+
+# Check GPU resource usage on the host
+nvidia-smi
+```
+
+**Solutions**:
+
+1. **Insufficient VRAM**:
+   - The most common issue. Check the model's VRAM requirements.
+   - **Solution**: Use a smaller or quantized model (e.g., a 7B model needs roughly 14-16GB VRAM at FP16, or about 6-8GB with 4-bit quantization such as AWQ/GPTQ) or upgrade your GPU.
+
+2. **CUDA & Driver Mismatches**:
+   - The `vLLM` image bundles its own CUDA runtime, which requires a sufficiently recent NVIDIA driver and the NVIDIA Container Toolkit on the host.
+   - **Solution**: Ensure your NVIDIA drivers are up-to-date and compatible with the CUDA version used in the `vLLM` Docker image.
+
+3. **Incorrect Model Path or Name**:
+   - The container can't find the model weights.
+   - **Solution**: Verify the volume mount in `docker-compose.yml` points to the correct local directory containing your models. Double-check the model name in your `.env` file.
+
+### Wyoming (Piper/Whisper) Service Issues
+
+**Problem**: The `wyoming` container is running, but Teto cannot speak or understand voice commands.
+
+**Diagnosis**:
+```bash
+# Check the Wyoming container logs for errors related to Piper or Whisper
+docker compose logs wyoming
+
+# Test the connection from another container
+docker exec -it teto_ai nc -zv wyoming 10300
+```
+
+**Solutions**:
+
+1. **Incorrect Piper Voice Model Path**:
+   - The service can't find the `.onnx` and `.json` files for the selected voice.
+   - **Solution**: Check your volume mounts and the voice name specified in your configuration.
+
+2. **Whisper Model Download Failure**:
+   - On first run, the service may fail to download the Whisper model.
+   - **Solution**: Ensure the container has internet access for the initial download, or manually place the model in the correct volume.
+
+3. **Port Conflict**:
+   - Another service on your host might be using port `10300`.
+   - **Solution**: Use `netstat -tulpn | grep 10300` to check for conflicts and remap the port in `docker-compose.yml` if needed.
+
+### Bot Can't Connect to Local AI Services
+
+**Problem**: The Teto bot is running but logs errors about being unable to reach `vllm` or `wyoming`.
+
+**Diagnosis**:
+```bash
+# Check the Teto bot logs for connection refused errors
+docker compose logs teto_ai
+
+# Ensure all services are on the same Docker network
+docker network inspect <your_network_name>
+```
+
+**Solutions**:
+
+1. **Incorrect Endpoint Configuration**:
+   - The `.env` file points to the wrong service name or port.
+   - **Solution**: Ensure `VLLM_ENDPOINT` and `WYOMING_HOST` use the correct service names as defined in `docker-compose.yml` (e.g., `vllm`, `wyoming`).
+
+2. **Docker Networking Issues**:
+   - The containers cannot resolve each other's service names.
+   - **Solution**: Ensure all services are defined within the same `docker-compose.yml` and share a common network.
+
+## 🐳 General Docker Issues
 
 ### Container Won't Start