@tin2tin
Created May 12, 2025 10:21
Audio async render queue
bl_info = {
"name": "AI Audio Generation Queue",
"author": "Your Name (and contributors to the original code)",
"version": (1, 0, 24), # No functional change for this issue, version kept
"blender": (3, 4, 0),
"location": "Video Sequence Editor > Sidebar (N-Panel) > AI Tools",
"description": "Adds audio generation jobs to a queue, processes them asynchronously, and adds results to the VSE.",
"warning": "Alpha version. Requires manual installation of AI model dependencies. Check console for errors.",
"doc_url": "https://your-documentation-link-here.com",
"category": "Sequencer",
}
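# Dependency note (rough, platform-dependent sketch): the model back-ends imported below
# (torch, torchaudio, scipy, diffusers, transformers, bark, whisperspeech, parler_tts,
# mmaudio, librosa) are NOT bundled with this addon. They can typically be installed into
# Blender's own Python, e.g. something like:
#   <blender-install>/<version>/python/bin/python -m pip install torch torchaudio scipy diffusers transformers
# Exact paths, package sets, and CUDA/MPS builds vary per platform; install only what the
# models you actually use require.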
import bpy
import threading
import time
import os
import uuid
from bpy.props import (
StringProperty,
IntProperty,
FloatProperty,
BoolProperty,
EnumProperty,
CollectionProperty,
PointerProperty,
)
from bpy.types import Operator, Panel, PropertyGroup
import random
from fractions import Fraction
from math import gcd
try:
import torch
except ImportError:
# Use bl_info.get("name", fallback) for the addon name in messages
addon_display_name = bl_info.get("name", "AI Audio Addon")
print(f"{addon_display_name}: PyTorch (torch) library not found. GPU/MPS acceleration and some models may not be available.")
torch = None
AUDIO_JOB_LOCK = threading.Lock()
COMPLETED_AUDIO_JOBS_CALLBACK_QUEUE = []
CURRENTLY_PROCESSING_JOB_ID = None
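# Threading model: generation runs in a daemon worker thread that only appends a result dict
# to COMPLETED_AUDIO_JOBS_CALLBACK_QUEUE under AUDIO_JOB_LOCK. The modal operator's timer
# drains that queue on the main thread, because Blender's data and UI APIs must not be
# touched from worker threads.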
def get_addon_module_name():
if __name__ == "__main__":
# For direct script execution, use the filename as the module name
return os.path.splitext(os.path.basename(__file__))[0]
# When run as an addon, __name__ is the module name (or package.module)
return __name__.split('.')[0]
def solve_path(path): return bpy.path.abspath(path)
def clean_filename(name):
name = "".join(c if c.isalnum() or c in " ._-" else "_" for c in name)
return name[:200]
def clear_cuda_cache():
print("Attempting to clear CUDA cache...")
try:
if torch and torch.cuda.is_available():
torch.cuda.empty_cache()
print("CUDA cache cleared.")
elif torch: # PyTorch is available, but CUDA isn't
print("CUDA not available, no GPU cache to clear via torch.cuda.")
else: # PyTorch itself is not available
print("PyTorch not available, cannot clear CUDA cache.")
except Exception as e:
print(f"Error in clear_cuda_cache: {e}")
def print_elapsed_time(start_time): print(f"Elapsed: {time.time() - start_time:.2f}s")
def low_vram(): return False  # Stub hook: return True to make the diffusers pipelines use enable_model_cpu_offload() instead of moving fully to the GPU
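# Return the lowest VSE channel with no strip overlapping [start_frame, end_frame);
# falls back to one channel above the current topmost strip.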
def find_first_empty_channel(start_frame, end_frame):
scene = bpy.context.scene
if not scene.sequence_editor: return 1
max_channel = 0
sequences = scene.sequence_editor.sequences_all
if not sequences: return 1
for seq in sequences:
if seq.channel > max_channel: max_channel = seq.channel
for ch in range(1, max_channel + 3):
occupied = False
for seq in sequences:
if seq.channel == ch:
if max(start_frame, seq.frame_final_start) < min(end_frame, seq.frame_final_end):
occupied = True
break
if not occupied: return ch
return max_channel + 1
# --- Audio Generation Task (runs in a worker thread) ---
def _generate_audio_task(job_data):
global COMPLETED_AUDIO_JOBS_CALLBACK_QUEUE
job_id = job_data['id']
print(f"Starting audio generation for job: {job_id}, Prompt: {job_data['prompt'][:30]}...")
_torch_thread = None
try:
import torch as _torch_thread
except ImportError:
with AUDIO_JOB_LOCK:
COMPLETED_AUDIO_JOBS_CALLBACK_QUEUE.append({'id': job_id, 'status': 'ERROR', 'filepath': None, 'error': "PyTorch not found in generation thread."})
return
try:
import torchaudio
import scipy
from scipy.io.wavfile import write as write_wav
except ImportError as e:
with AUDIO_JOB_LOCK:
COMPLETED_AUDIO_JOBS_CALLBACK_QUEUE.append({'id': job_id, 'status': 'ERROR', 'filepath': None, 'error': f"Missing dependency in generation thread: {e}"})
return
audio_model_card = job_data['audio_model_card']
local_files_only = job_data['local_files_only']
gfx_device = job_data['gfx_device']
prompt_text = job_data['prompt']
negative_prompt_text = job_data['negative_prompt']
movie_num_inference_steps = job_data['inference_steps']
movie_num_guidance = job_data['guidance_scale']
audio_length_in_s = job_data['audio_length_s']
seed = job_data['seed']
parler_direction_prompt = job_data.get('parler_direction_prompt')
input_movie_path = job_data.get('input_movie_path')
input_image_path = job_data.get('input_image_path')
output_filepath = job_data['output_filepath']
pipe = None
tokenizer = None
try:
if audio_model_card == "stabilityai/stable-audio-open-1.0":
from diffusers import StableAudioPipeline
repo_id = "stabilityai/stable-audio-open-1.0"
pipe = StableAudioPipeline.from_pretrained(repo_id, torch_dtype=_torch_thread.float16, local_files_only=local_files_only)
if low_vram(): pipe.enable_model_cpu_offload()
else: pipe.to(gfx_device)
elif audio_model_card == "WhisperSpeech":
from whisperspeech.pipeline import Pipeline
pipe = Pipeline(s2a_ref=job_data.get('whisper_model_ref', "collabora/whisperspeech:s2a-q4-small-en+pl.model"))
elif audio_model_card in ("parler-tts/parler-tts-large-v1", "parler-tts/parler-tts-mini-v1"):
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
print(f"Attempting to load Parler-TTS model: {audio_model_card} (standard loading)")
try:
pipe = ParlerTTSForConditionalGeneration.from_pretrained(
audio_model_card,
local_files_only=local_files_only
).to(gfx_device)
if gfx_device != "cpu":
if _torch_thread.cuda.is_available() and _torch_thread.cuda.is_bf16_supported() and 'large' in audio_model_card:
print("Parler-TTS: Attempting bfloat16.")
try: pipe = pipe.to(dtype=_torch_thread.bfloat16)
except Exception as e_bf16: print(f"Parler-TTS: bfloat16 conversion failed: {e_bf16}")
elif hasattr(pipe, "half"):
print("Parler-TTS: Attempting float16 (half precision).")
try: pipe = pipe.half()
except Exception as e_fp16: print(f"Parler-TTS: float16 conversion failed: {e_fp16}")
tokenizer = AutoTokenizer.from_pretrained(
audio_model_card,
local_files_only=local_files_only
)
print(f"Successfully loaded Parler-TTS model and tokenizer for {audio_model_card}")
except Exception as e_parler:
print(f"Error during Parler-TTS loading for {audio_model_card}: {e_parler}")
raise
elif audio_model_card == "bark":
import numpy as np
from bark.generation import generate_text_semantic, preload_models
from bark.api import semantic_to_waveform
from bark import SAMPLE_RATE
preload_models( text_use_small=True, coarse_use_small=True, fine_use_gpu=True, fine_use_small=True)
elif audio_model_card == "MMAudio":
import librosa
import mmaudio
from mmaudio.eval_utils import ModelConfig, all_model_cfg, generate, load_video, load_image, make_video, VideoInfo
from mmaudio.model.flow_matching import FlowMatching
from mmaudio.model.networks import MMAudio, get_my_mmaudio
from mmaudio.model.utils.features_utils import FeaturesUtils
_torch_thread.backends.cuda.matmul.allow_tf32 = True
_torch_thread.backends.cudnn.allow_tf32 = True
_device = _torch_thread.device(gfx_device)
_dtype = _torch_thread.bfloat16 if _device.type == 'cuda' else _torch_thread.float32
_model_config_name = job_data.get('mmaudio_model_name', 'large_44k_v2')
_model_config: ModelConfig = all_model_cfg[_model_config_name]
_model_config.download_if_needed()
_scheduler_config = _model_config.seq_cfg
_model: MMAudio = get_my_mmaudio(_model_config.model_name).to(_device, _dtype).eval()
_model.load_weights(_torch_thread.load(_model_config.model_path, map_location=_device, weights_only=True))
_feature_extractor = FeaturesUtils(
tod_vae_ckpt=_model_config.vae_path,
synchformer_ckpt=_model_config.synchformer_ckpt,
enable_conditions=True,
mode=_model_config.mode,
bigvgan_vocoder_ckpt=_model_config.bigvgan_16k_path,
need_vae_encoder=False
).to(_device, _dtype).eval()
pipe = {'model': _model, 'feature_extractor': _feature_extractor, 'scheduler_config': _scheduler_config, 'model_config': _model_config, 'device': _device, 'dtype': _dtype}
elif audio_model_card == "facebook/musicgen-stereo-melody-large":
from transformers import pipeline as hf_pipeline
from transformers import set_seed
pipe = hf_pipeline("text-to-audio", audio_model_card, device=gfx_device, torch_dtype=_torch_thread.float16, local_files_only=local_files_only)
elif audio_model_card == "cvssp/audioldm2-large":
from diffusers import AudioLDM2Pipeline
pipe = AudioLDM2Pipeline.from_pretrained(audio_model_card, torch_dtype=_torch_thread.float16, local_files_only=local_files_only)
if low_vram(): pipe.enable_model_cpu_offload()
else: pipe.to(gfx_device)
else:
raise ValueError(f"Audio model card not supported or not found: {audio_model_card}")
if _torch_thread.cuda.is_available() and gfx_device != "cpu":
generator = _torch_thread.Generator(gfx_device).manual_seed(seed) if seed != 0 else None
else:
generator = _torch_thread.Generator("cpu").manual_seed(seed) if seed != 0 else None
start_time_inference = time.time()
if audio_model_card == "stabilityai/stable-audio-open-1.0":
audio = pipe(
prompt=prompt_text,
negative_prompt=negative_prompt_text if negative_prompt_text else None,
num_inference_steps=movie_num_inference_steps,
guidance_scale=movie_num_guidance,
num_waveforms_per_prompt=1,
generator=generator,
).audios[0]
if audio.ndim == 1: audio = audio.unsqueeze(0)
output_np = audio.cpu().numpy()
sample_rate = getattr(pipe.vae.config, "sampling_rate", 44100)  # StableAudioPipeline keeps the sample rate on the VAE config (44.1 kHz for stable-audio-open-1.0)
write_wav(output_filepath, sample_rate, output_np.T)
elif audio_model_card == "bark":
import numpy as np
from bark import SAMPLE_RATE
from bark.generation import generate_text_semantic
from bark.api import semantic_to_waveform
rate = SAMPLE_RATE
SPEAKER = job_data.get('bark_speaker_full_id', "v2/en_speaker_0")
sentences = [s.strip() for s in prompt_text.replace("\n", " ").split('.') if s.strip()]
if not sentences: sentences = [prompt_text]
pieces = []
silence_duration_samples = int(0.25 * rate)
silence_np = np.zeros(silence_duration_samples, dtype=np.float32)
for sentence_chunk in sentences:
if not sentence_chunk: continue
semantic_tokens = generate_text_semantic(
sentence_chunk, history_prompt=SPEAKER, temp=0.7, min_eos_p=0.05
)
audio_array = semantic_to_waveform(semantic_tokens, history_prompt=SPEAKER)
pieces.append(audio_array)
pieces.append(silence_np)
if pieces:
if pieces and np.array_equal(pieces[-1], silence_np):
pieces.pop()
if pieces:
audio_bark = np.concatenate(pieces)
write_wav(output_filepath, rate, audio_bark)
else:
raise RuntimeError("Bark generated no audio after processing sentences.")
else:
raise RuntimeError("Bark generated no audio pieces.")
elif audio_model_card == "WhisperSpeech":
speaker_path = job_data.get('whisper_speaker_path')
lang = job_data.get('whisper_lang', 'en')
cps = job_data.get('whisper_cps', 15)
audio_tensor = pipe.generate(prompt_text, speaker=speaker_path, lang=lang, cps=cps)
torchaudio.save(output_filepath, audio_tensor.cpu(), pipe.fs)
elif audio_model_card in ("parler-tts/parler-tts-large-v1", "parler-tts/parler-tts-mini-v1"):
input_ids = tokenizer(parler_direction_prompt, return_tensors="pt").input_ids.to(gfx_device)
prompt_input_ids = tokenizer(prompt_text, return_tensors="pt").input_ids.to(gfx_device)
generation = pipe.generate(
input_ids=input_ids,
prompt_input_ids=prompt_input_ids,
do_sample=True,
temperature=0.7,
).to(_torch_thread.float32)
audio_arr = generation.cpu().numpy().squeeze()
if audio_arr.ndim == 2 and audio_arr.shape[0] < audio_arr.shape[1]:
audio_arr = audio_arr.T
write_wav(output_filepath, pipe.config.sampling_rate, audio_arr)
elif audio_model_card == "facebook/musicgen-stereo-melody-large":
from transformers import set_seed
set_seed(seed)
max_tokens_for_duration = int(min(audio_length_in_s * 25, 1500))
music = pipe(
prompt_text,
negative_prompt=negative_prompt_text if negative_prompt_text else None,
guidance_scale=movie_num_guidance,
forward_params={"max_new_tokens": max_tokens_for_duration}
)
audio_data = music["audio"][0].T
write_wav(output_filepath, music["sampling_rate"], audio_data.cpu().numpy())
elif audio_model_card == "cvssp/audioldm2-large":
audio = pipe(
prompt_text,
negative_prompt=negative_prompt_text if negative_prompt_text else None,
num_inference_steps=movie_num_inference_steps,
audio_length_in_s=audio_length_in_s,
guidance_scale=movie_num_guidance,
generator=generator,
).audios[0]
audio_to_write = audio.cpu().numpy() if hasattr(audio, "cpu") else audio  # AudioLDM2 may return a tensor or a NumPy array
if audio_to_write.ndim == 2 and audio_to_write.shape[0] < audio_to_write.shape[1]:
audio_to_write = audio_to_write.T
sample_rate = getattr(pipe.scheduler.config, "sample_rate",
getattr(pipe.config, "sample_rate", 16000))
write_wav(output_filepath, sample_rate, audio_to_write)
elif audio_model_card == "MMAudio":
_model = pipe['model']
_feature_extractor = pipe['feature_extractor']
_scheduler_config = pipe['scheduler_config']
_model_config = pipe['model_config']
_device = pipe['device']
_dtype = pipe['dtype']
_scheduler = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=movie_num_inference_steps)
_video_data = None
_clip_frames_for_generate, _sync_frames_for_generate = None, None
_image_input_flag = False
mmaudio_sr = getattr(_model_config, 'sr', 44100)
_fps_num_from_job = job_data.get('fps_numerator', 24)
_fps_den_from_job = job_data.get('fps_denominator', 1.0)
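# Blender expresses frame rate as fps / fps_base (e.g. 30 / 1.001 for 29.97 NTSC);
# convert that to an exact integer Fraction for MMAudio's VideoInfo.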
if abs(_fps_den_from_job - 1.0) < 0.00001:
_fps_fraction_num_int = _fps_num_from_job
_fps_fraction_den_int = 1
elif abs(_fps_den_from_job - 1.001) < 0.00001:
_fps_fraction_num_int = int(round(_fps_num_from_job * 1000))
_fps_fraction_den_int = 1001
else:
combined_frac = Fraction(_fps_num_from_job / _fps_den_from_job).limit_denominator(10000)
_fps_fraction_num_int = combined_frac.numerator
_fps_fraction_den_int = combined_frac.denominator
if _fps_fraction_den_int == 0: _fps_fraction_den_int = 1
_fps_fraction = Fraction(_fps_fraction_num_int, _fps_fraction_den_int)
print(f"MMAudio using FPS fraction: {_fps_fraction}")
is_tta_case = not (input_movie_path and os.path.isfile(input_movie_path)) and \
not (input_image_path and os.path.isfile(input_image_path))
if not is_tta_case:
if input_movie_path and os.path.isfile(input_movie_path):
_video_data = load_video(input_movie_path, audio_length_in_s)
_clip_frames_for_generate = _video_data.clip_frames.unsqueeze(0).to(_device, _dtype)
_sync_frames_for_generate = _video_data.sync_frames.unsqueeze(0).to(_device, _dtype)
_scheduler_config.duration = _video_data.duration_sec
elif input_image_path and os.path.isfile(input_image_path):
_image_data = load_image(input_image_path)
_clip_frames_for_generate = _image_data.clip_frames.unsqueeze(0).to(_device, _dtype)
_sync_frames_for_generate = _image_data.sync_frames.unsqueeze(0).to(_device, _dtype)
_video_data = VideoInfo.from_image_info(_image_data, audio_length_in_s, fps=_fps_fraction)
_scheduler_config.duration = audio_length_in_s
_image_input_flag = True
else:
_scheduler_config.duration = audio_length_in_s
dummy_frame_shape = (1, 3, 224, 224)
_dummy_frame_tensor_batched = _torch_thread.zeros(dummy_frame_shape, device=_device, dtype=_dtype)
_squeezed_dummy_frame = _dummy_frame_tensor_batched.squeeze(0)
if not _video_data:
_video_data = VideoInfo(all_frames=_squeezed_dummy_frame,
clip_frames=_squeezed_dummy_frame,
sync_frames=_squeezed_dummy_frame,
duration_sec=audio_length_in_s,
fps=_fps_fraction)
_model.update_seq_lengths(_scheduler_config.latent_seq_len, _scheduler_config.clip_seq_len, _scheduler_config.sync_seq_len)
with _torch_thread.no_grad():
keyword_args = {
"feature_utils": _feature_extractor,
"net": _model,
"fm": _scheduler,
"rng": generator,
"cfg_strength": movie_num_guidance,
"image_input": _image_input_flag
}
if is_tta_case:
if negative_prompt_text:
keyword_args["negative_prompt"] = [negative_prompt_text]
generated_audio = generate(None, None, [prompt_text], **keyword_args)
else:
keyword_args["text_prompt"] = [prompt_text]
if negative_prompt_text:
keyword_args["negative_text"] = [negative_prompt_text]
if _clip_frames_for_generate is not None:
keyword_args["video_frames"] = _clip_frames_for_generate
if _sync_frames_for_generate is not None:
keyword_args["sync_frames"] = _sync_frames_for_generate
generated_audio = generate(**keyword_args)
audio_output = generated_audio.float().cpu()[0]
target_sr = mmaudio_sr
print(f"MMAudio using target sample rate: {target_sr}")
if output_filepath.endswith(".mp4"):
make_video(_video_data, output_filepath, audio_output, sampling_rate=target_sr)
print(f"MMAudio Saved video to {output_filepath}")
else:
if audio_output.ndim == 1: audio_output = audio_output.unsqueeze(0)
torchaudio.save(output_filepath, audio_output, target_sr)
print(f"MMAudio Saved audio to {output_filepath}")
else:
raise NotImplementedError(f"Model {audio_model_card} inference not fully implemented in threaded task.")
print_elapsed_time(start_time_inference)
with AUDIO_JOB_LOCK:
COMPLETED_AUDIO_JOBS_CALLBACK_QUEUE.append({'id': job_id, 'status': 'COMPLETED', 'filepath': output_filepath, 'error': None})
except ModuleNotFoundError as e:
error_message_detail = f"Module not found: {e.name}. Please install dependencies."
print(f"ERROR in job {job_id}: {error_message_detail}")
with AUDIO_JOB_LOCK:
COMPLETED_AUDIO_JOBS_CALLBACK_QUEUE.append({'id': job_id, 'status': 'ERROR', 'filepath': None, 'error': error_message_detail})
except Exception as e:
print(f"ERROR in job {job_id}: {e}")
import traceback
error_message_detail = f"{type(e).__name__}: {str(e)}. Check console."
traceback.print_exc()
with AUDIO_JOB_LOCK:
COMPLETED_AUDIO_JOBS_CALLBACK_QUEUE.append({'id': job_id, 'status': 'ERROR', 'filepath': None, 'error': error_message_detail})
finally:
del pipe
if tokenizer is not None: del tokenizer
if audio_model_card == "MMAudio" and '_feature_extractor' in locals():  # only clean up if the full MMAudio stack was built
del _model, _feature_extractor, _scheduler_config, _model_config
print(f"Finished audio generation task for job: {job_id}")
# --- Property Group for Queue Items ---
class AudioQueueItem(PropertyGroup):
id: StringProperty(name="ID")
name: StringProperty(name="Job Name", default="Audio Job")
status: EnumProperty(
name="Status",
items=[
("PENDING", "Pending", "Waiting to be processed"),
("PROCESSING", "Processing", "Currently generating audio"),
("COMPLETED", "Completed", "Audio generated successfully"),
("ERROR", "Error", "An error occurred during generation"),
],
default="PENDING",
)
prompt: StringProperty(name="Prompt", default="A beautiful melody", maxlen=1024)
negative_prompt: StringProperty(name="Negative Prompt", maxlen=1024)
inference_steps: IntProperty(name="Inference Steps", default=50, min=1, max=500)
guidance_scale: FloatProperty(name="Guidance Scale", default=7.0, min=0.0, max=50.0)
seed: IntProperty(name="Seed", default=0)
use_random_seed: BoolProperty(name="Use Random Seed", default=True)
audio_length_s: FloatProperty(name="Audio Length (s)", default=10.0, min=0.1, max=600.0)
output_filepath: StringProperty(name="Output Filepath", subtype='FILE_PATH', maxlen=1024)
start_frame: IntProperty(name="Start Frame", default=1, min=1)
channel: IntProperty(name="Channel", default=1, min=1)
audio_model_card: StringProperty(name="Audio Model Card", maxlen=256)
local_files_only: BoolProperty(name="Local Files Only")
gfx_device: StringProperty(name="Graphics Device", default="cpu", maxlen=32)
bark_speaker_full_id: StringProperty(name="Bark Speaker ID", default="v2/en_speaker_0", maxlen=64)
whisper_model_ref: StringProperty(name="WhisperSpeech Model Ref", default="collabora/whisperspeech:s2a-q4-small-en+pl.model", maxlen=256)
whisper_speaker_path: StringProperty(name="WhisperSpeech Speaker Path (.pt)", subtype='FILE_PATH', maxlen=1024)
whisper_lang: StringProperty(name="WhisperSpeech Language", default="en", maxlen=10)
whisper_cps: IntProperty(name="WhisperSpeech CPS", default=15, min=5, max=30)
parler_direction_prompt: StringProperty(name="Parler Direction Prompt", default="A clear, expressive voice speaking calmly.", maxlen=512)
mmaudio_model_name: StringProperty(name="MMAudio Model Name", default="large_44k_v2", maxlen=128)
input_movie_path: StringProperty(name="MMAudio Input Movie", subtype='FILE_PATH', maxlen=1024)
input_image_path: StringProperty(name="MMAudio Input Image", subtype='FILE_PATH', maxlen=1024)
fps_numerator: IntProperty(name="FPS Numerator", default=24)
fps_denominator: FloatProperty(name="FPS Denominator", default=1.0)
error_message: StringProperty(name="Error Message", default="", maxlen=1024)
# --- Scene Properties ---
def register_scene_properties():
bpy.types.Scene.audio_gen_prompt = StringProperty(name="Prompt", default="A catchy synthwave tune", maxlen=1024)
bpy.types.Scene.audio_gen_negative_prompt = StringProperty(name="Negative Prompt", default="", maxlen=1024)
bpy.types.Scene.audio_gen_steps = IntProperty(name="Inference Steps", default=50, min=10, max=500)
bpy.types.Scene.audio_gen_guidance = FloatProperty(name="Guidance Scale", default=7.0, min=0.0, max=50.0)
bpy.types.Scene.audio_gen_seed = IntProperty(name="Seed", default=12345, min=0)
bpy.types.Scene.audio_gen_random_seed = BoolProperty(name="Use Random Seed", default=True)
bpy.types.Scene.audio_gen_length_s = FloatProperty(name="Duration (s)", default=10.0, min=1.0, max=600.0)
bpy.types.Scene.audio_gen_output_dir = StringProperty(name="Output Directory", subtype='DIR_PATH', default="//audio_gen/", maxlen=1024)
bpy.types.Scene.audio_gen_bark_speaker = StringProperty(name="Bark Speaker", default="v2/en_speaker_0", maxlen=64)
bpy.types.Scene.audio_gen_parler_description = StringProperty(name="Parler Description", default="A clear, expressive voice speaking calmly.", maxlen=512)
bpy.types.Scene.audio_gen_mmaudio_movie_input = StringProperty(name="MMAudio Movie Input", subtype='FILE_PATH', maxlen=1024)
bpy.types.Scene.audio_gen_mmaudio_image_input = StringProperty(name="MMAudio Image Input", subtype='FILE_PATH', maxlen=1024)
bpy.types.Scene.audio_gen_whisper_speaker_file = StringProperty(name="WhisperSpeech Speaker File (.pt)", subtype='FILE_PATH', maxlen=1024)
bpy.types.Scene.audio_render_queue = CollectionProperty(type=AudioQueueItem)
bpy.types.Scene.audio_render_queue_index = IntProperty()
def unregister_scene_properties():
props_to_delete = [
"audio_gen_prompt", "audio_gen_negative_prompt", "audio_gen_steps",
"audio_gen_guidance", "audio_gen_seed", "audio_gen_random_seed",
"audio_gen_length_s", "audio_gen_output_dir", "audio_gen_bark_speaker",
"audio_gen_parler_description", "audio_gen_mmaudio_movie_input",
"audio_gen_mmaudio_image_input", "audio_gen_whisper_speaker_file",
"audio_render_queue", "audio_render_queue_index"
]
for prop_name in props_to_delete:
if hasattr(bpy.types.Scene, prop_name):
delattr(bpy.types.Scene, prop_name)
# --- Operators ---
class SEQUENCER_OT_add_to_audio_queue(Operator):
bl_idname = "sequencer.add_to_audio_queue"
bl_label = "Add to Audio Queue"
bl_options = {"REGISTER", "UNDO"}
@classmethod
def poll(cls, context): return True
def execute(self, context):
scene = context.scene
addon_name = get_addon_module_name()
prefs = context.preferences.addons[addon_name].preferences
if not scene.sequence_editor: scene.sequence_editor_create()
item = scene.audio_render_queue.add()
item.id = str(uuid.uuid4())
item.prompt = scene.audio_gen_prompt
item.name = item.prompt[:30] + "..." if len(item.prompt) > 30 else item.prompt
item.negative_prompt = scene.audio_gen_negative_prompt
item.inference_steps = scene.audio_gen_steps
item.guidance_scale = scene.audio_gen_guidance
item.use_random_seed = scene.audio_gen_random_seed
if item.use_random_seed:
item.seed = random.randint(0, 2**31 - 1)
else:
item.seed = scene.audio_gen_seed
item.audio_length_s = scene.audio_gen_length_s
item.start_frame = scene.frame_current
fps = scene.render.fps / scene.render.fps_base
item.channel = find_first_empty_channel(item.start_frame, item.start_frame + int(item.audio_length_s * fps + 1))
item.audio_model_card = prefs.audio_model_card
item.local_files_only = prefs.local_files_only
item.gfx_device = "cpu"
if torch:
try:
if prefs.execution_provider == "CUDAExecutionProvider":
if torch.cuda.is_available(): item.gfx_device = "cuda"
else: self.report({'WARNING'}, "CUDA selected, but not available. Using CPU.")
elif prefs.execution_provider == "CoreMLExecutionProvider":
if hasattr(torch, "backends") and hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): item.gfx_device = "mps"
else: self.report({'WARNING'}, "CoreML/MPS selected, but not available. Using CPU.")
except Exception as e: self.report({'WARNING'}, f"Error checking GPU devices: {e}. Using CPU.")
elif prefs.execution_provider != "CPUExecutionProvider":
self.report({'WARNING'}, "PyTorch not found. GPU execution unavailable. Using CPU.")
if item.audio_model_card == "bark": item.bark_speaker_full_id = scene.audio_gen_bark_speaker
elif item.audio_model_card == "WhisperSpeech": item.whisper_speaker_path = solve_path(scene.audio_gen_whisper_speaker_file) if scene.audio_gen_whisper_speaker_file else ""
elif item.audio_model_card in ("parler-tts/parler-tts-large-v1", "parler-tts/parler-tts-mini-v1"): item.parler_direction_prompt = scene.audio_gen_parler_description
elif item.audio_model_card == "MMAudio":
item.input_movie_path = solve_path(scene.audio_gen_mmaudio_movie_input) if scene.audio_gen_mmaudio_movie_input else ""
item.input_image_path = solve_path(scene.audio_gen_mmaudio_image_input) if scene.audio_gen_mmaudio_image_input else ""
item.fps_numerator = scene.render.fps
item.fps_denominator = scene.render.fps_base
output_dir = solve_path(scene.audio_gen_output_dir)
if not os.path.isdir(output_dir):
try: os.makedirs(output_dir, exist_ok=True)
except OSError as e:
self.report({'ERROR'}, f"Could not create output directory: {output_dir} - {e}")
scene.audio_render_queue.remove(len(scene.audio_render_queue)-1)
return {'CANCELLED'}
base_filename = clean_filename(f"{item.seed}_{item.prompt}")
file_extension = ".wav"
if item.audio_model_card == "MMAudio" and (item.input_movie_path or item.input_image_path):
file_extension = ".mp4"
item.output_filepath = os.path.join(output_dir, f"{base_filename}{file_extension}")
item.error_message = ""
self.report({"INFO"}, f"Added '{item.name}' to audio queue.")
return {"FINISHED"}
class SEQUENCER_OT_process_audio_queue(Operator):
bl_idname = "sequencer.process_audio_queue"
bl_label = "Process Queue"
bl_options = {"REGISTER"}
_timer = None
_previous_model_card = None
@classmethod
def poll(cls, context):
has_pending = any(job.status == "PENDING" for job in context.scene.audio_render_queue)
return has_pending and CURRENTLY_PROCESSING_JOB_ID is None
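# Snapshot the job's properties into a plain dict and launch the worker thread; the thread must never touch bpy data directly.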
def _start_job_thread(self, job_item):
job_data = {k: getattr(job_item, k) for k in job_item.rna_type.properties.keys() if k != 'rna_type'}
job_data['output_filepath'] = solve_path(job_item.output_filepath)
if job_item.input_movie_path: job_data['input_movie_path'] = solve_path(job_item.input_movie_path)
if job_item.input_image_path: job_data['input_image_path'] = solve_path(job_item.input_image_path)
if job_item.whisper_speaker_path: job_data['whisper_speaker_path'] = solve_path(job_item.whisper_speaker_path)
thread = threading.Thread(target=_generate_audio_task, args=(job_data,))
thread.daemon = True
thread.start()
print(f"Processing thread started for job: {job_item.name}")
def _start_next_pending_job(self, context):
global CURRENTLY_PROCESSING_JOB_ID
scene = context.scene
next_job_to_process = None
active_job_idx = -1
for idx, job_in_q in enumerate(scene.audio_render_queue):
if job_in_q.status == "PENDING":
next_job_to_process = job_in_q
active_job_idx = idx
break
if next_job_to_process:
print(f"Preparing to start next pending job: {next_job_to_process.name}")
if SEQUENCER_OT_process_audio_queue._previous_model_card is not None and \
SEQUENCER_OT_process_audio_queue._previous_model_card != next_job_to_process.audio_model_card:
print(f"Model changed from {SEQUENCER_OT_process_audio_queue._previous_model_card} to {next_job_to_process.audio_model_card}. Clearing VRAM.")
clear_cuda_cache()
SEQUENCER_OT_process_audio_queue._previous_model_card = next_job_to_process.audio_model_card
CURRENTLY_PROCESSING_JOB_ID = next_job_to_process.id
next_job_to_process.status = "PROCESSING"
next_job_to_process.error_message = ""
if active_job_idx != -1:
scene.audio_render_queue_index = active_job_idx
self._start_job_thread(next_job_to_process)
for area in context.screen.areas:
if area.type == 'SEQUENCE_EDITOR' or area.ui_type == 'PROPERTIES':
area.tag_redraw()
return True
return False
def execute(self, context):
if self._start_next_pending_job(context):
if SEQUENCER_OT_process_audio_queue._timer is None:
SEQUENCER_OT_process_audio_queue._timer = bpy.context.window_manager.event_timer_add(0.5, window=context.window)
context.window_manager.modal_handler_add(self)
current_job_name = "Unknown"
if 0 <= context.scene.audio_render_queue_index < len(context.scene.audio_render_queue):
current_job_name = context.scene.audio_render_queue[context.scene.audio_render_queue_index].name
self.report({"INFO"}, f"Started processing job: {current_job_name}")
return {"RUNNING_MODAL"}
else:
self.report({"INFO"}, "No pending jobs to process.")
SEQUENCER_OT_process_audio_queue._previous_model_card = None
return {"CANCELLED"}
def modal(self, context, event):
global COMPLETED_AUDIO_JOBS_CALLBACK_QUEUE, CURRENTLY_PROCESSING_JOB_ID
scene = context.scene
if event.type == 'TIMER':
callbacks_to_process = []
with AUDIO_JOB_LOCK:
if COMPLETED_AUDIO_JOBS_CALLBACK_QUEUE:
callbacks_to_process.extend(COMPLETED_AUDIO_JOBS_CALLBACK_QUEUE)
COMPLETED_AUDIO_JOBS_CALLBACK_QUEUE.clear()
job_just_finished_this_cycle = False
for result in callbacks_to_process:
job_id_completed = result['id']
if CURRENTLY_PROCESSING_JOB_ID == job_id_completed:
CURRENTLY_PROCESSING_JOB_ID = None
job_just_finished_this_cycle = True
job_item = next((job for job in scene.audio_render_queue if job.id == job_id_completed), None)
if not job_item:
print(f"Job ID {job_id_completed} not found in queue after completion.")
continue
job_item.status = result['status']
if result['status'] == 'COMPLETED':
job_item.output_filepath = solve_path(result['filepath'])
print(f"Job '{job_item.name}' completed. Adding strip.")
self._add_strip_to_vse(context, job_item)
else:
error_msg = result.get('error', 'Unknown error')
print(f"Job '{job_item.name}' failed: {error_msg}")
job_item.error_message = error_msg
for area in context.screen.areas:
if area.type == 'SEQUENCE_EDITOR' or area.ui_type == 'PROPERTIES':
area.tag_redraw()
if job_just_finished_this_cycle:
if not self._start_next_pending_job(context):
print("All jobs processed or queue processing stopped.")
if SEQUENCER_OT_process_audio_queue._timer:
bpy.context.window_manager.event_timer_remove(SEQUENCER_OT_process_audio_queue._timer)
SEQUENCER_OT_process_audio_queue._timer = None
SEQUENCER_OT_process_audio_queue._previous_model_card = None
return {'FINISHED'}
if event.type in {'RIGHTMOUSE', 'ESC'}:
if SEQUENCER_OT_process_audio_queue._timer:
bpy.context.window_manager.event_timer_remove(SEQUENCER_OT_process_audio_queue._timer)
SEQUENCER_OT_process_audio_queue._timer = None
if CURRENTLY_PROCESSING_JOB_ID:
self.report({'WARNING'}, f"Queue processing interrupted. Job {CURRENTLY_PROCESSING_JOB_ID} may still finish in the background.")
else:
self.report({'INFO'}, "Queue processing stopped.")
SEQUENCER_OT_process_audio_queue._previous_model_card = None
return {'CANCELLED'}
return {'PASS_THROUGH'}
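# Add the finished file to the VSE as a sound strip (or a movie strip for MMAudio .mp4 output) on the channel chosen when the job was queued.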
def _add_strip_to_vse(self, context, job_item):
scene = context.scene
seq_editor = scene.sequence_editor
if not seq_editor: seq_editor = scene.sequence_editor_create()
abs_filepath = solve_path(job_item.output_filepath)
if not os.path.exists(abs_filepath):
error_msg = f"Generated file not found: {abs_filepath}"
job_item.status = "ERROR"
job_item.error_message = error_msg
print(error_msg)
return
try:
if abs_filepath.lower().endswith(".mp4"):
new_strip = seq_editor.sequences.new_movie(
name=job_item.name, filepath=abs_filepath,
channel=job_item.channel, frame_start=job_item.start_frame,
)
else:
new_strip = seq_editor.sequences.new_sound(
name=job_item.name, filepath=abs_filepath,
channel=job_item.channel, frame_start=job_item.start_frame,
)
if new_strip: seq_editor.active_strip = new_strip
bpy.ops.wm.redraw_timer(type='DRAW_WIN_SWAP', iterations=1)
except Exception as e:
error_msg = f"Failed to add strip for {job_item.name}: {e}"
job_item.status = "ERROR"
job_item.error_message = error_msg
print(error_msg)
class SEQUENCER_OT_clear_audio_queue(Operator):
bl_idname = "sequencer.clear_audio_queue"
bl_label = "Clear Queue"
bl_options = {"REGISTER", "UNDO"}
@classmethod
def poll(cls, context):
return len(context.scene.audio_render_queue) > 0 and CURRENTLY_PROCESSING_JOB_ID is None
def execute(self, context):
if CURRENTLY_PROCESSING_JOB_ID is not None:
self.report({"WARNING"}, "Cannot clear queue while a job is processing.")
return {"CANCELLED"}
context.scene.audio_render_queue.clear()
context.scene.audio_render_queue_index = 0
self.report({"INFO"}, "Audio queue cleared.")
return {"FINISHED"}
class SEQUENCER_OT_remove_from_audio_queue(Operator):
bl_idname = "sequencer.remove_from_audio_queue"
bl_label = "Remove Job"
bl_options = {"REGISTER", "UNDO"}
index: IntProperty(name="Job Index")
@classmethod
def poll(cls, context):
return len(context.scene.audio_render_queue) > 0 and CURRENTLY_PROCESSING_JOB_ID is None
def execute(self, context):
scene = context.scene
if not (0 <= self.index < len(scene.audio_render_queue)):
self.report({'WARNING'}, "Invalid job index.")
return {"CANCELLED"}
job_to_remove = scene.audio_render_queue[self.index]
if job_to_remove.status == "PROCESSING":
self.report({"WARNING"}, "Cannot remove a job marked as processing.")
return {"CANCELLED"}
scene.audio_render_queue.remove(self.index)
if scene.audio_render_queue_index >= self.index and scene.audio_render_queue_index > 0:
scene.audio_render_queue_index -=1
elif not scene.audio_render_queue:
scene.audio_render_queue_index = 0
self.report({"INFO"}, f"Job '{job_to_remove.name}' removed from queue.")
return {"FINISHED"}
# --- UI Panel ---
class SEQUENCER_PT_audio_generation_panel(Panel):
bl_label = "AI Audio Generation"
bl_space_type = "SEQUENCE_EDITOR"
bl_region_type = "UI"
bl_category = "AI Tools"
def draw(self, context):
layout = self.layout
scene = context.scene
addon_name = get_addon_module_name()
try:
prefs = context.preferences.addons[addon_name].preferences
except KeyError:
layout.label(text=f"Addon '{addon_name}' preferences not found. Ensure it's installed and enabled correctly.", icon='ERROR')
return
layout.label(text="Audio Generation Settings:")
col = layout.column(align=True)
col.prop(prefs, "audio_model_card", text="Model")
col.prop(scene, "audio_gen_prompt", text="Prompt")
col.prop(scene, "audio_gen_negative_prompt", text="Negative")
row = col.row(align=True); row.prop(scene, "audio_gen_steps", text="Steps"); row.prop(scene, "audio_gen_guidance", text="Guidance")
row = col.row(align=True); sub = row.row(align=True); sub.prop(scene, "audio_gen_seed", text="Seed"); sub.enabled = not scene.audio_gen_random_seed; row.prop(scene, "audio_gen_random_seed", text="", icon='QUESTION')
col.prop(scene, "audio_gen_length_s", text="Duration (s)")
col.prop(scene, "audio_gen_output_dir", text="Output Dir")
if prefs.audio_model_card == "bark":
box = layout.box(); box.label(text="Bark Settings:"); box.prop(scene, "audio_gen_bark_speaker", text="Speaker")
elif prefs.audio_model_card == "WhisperSpeech":
box = layout.box(); box.label(text="WhisperSpeech Settings:"); box.prop(scene, "audio_gen_whisper_speaker_file", text="Speaker .pt")
elif prefs.audio_model_card in ("parler-tts/parler-tts-large-v1", "parler-tts/parler-tts-mini-v1"):
box = layout.box(); box.label(text="Parler TTS Settings:"); box.prop(scene, "audio_gen_parler_description", text="Description")
elif prefs.audio_model_card == "MMAudio":
box = layout.box(); box.label(text="MMAudio Settings:"); box.prop(scene, "audio_gen_mmaudio_movie_input", text="Movie Input"); box.prop(scene, "audio_gen_mmaudio_image_input", text="Image Input")
layout.separator()
layout.operator(SEQUENCER_OT_add_to_audio_queue.bl_idname, icon="ADD")
layout.separator()
layout.label(text="Audio Render Queue:")
row = layout.row(align=True)
process_op_text = "Process Queue"
if CURRENTLY_PROCESSING_JOB_ID is not None or \
(SEQUENCER_OT_process_audio_queue._timer is not None and any(j.status == "PENDING" for j in scene.audio_render_queue)):
process_op_text = "Processing..."
row.operator(SEQUENCER_OT_process_audio_queue.bl_idname, icon="PLAY", text=process_op_text)
row.operator(SEQUENCER_OT_clear_audio_queue.bl_idname, icon="TRASH", text="Clear All")
if CURRENTLY_PROCESSING_JOB_ID:
processing_job_item = next((job for job in scene.audio_render_queue if job.id == CURRENTLY_PROCESSING_JOB_ID), None)
if processing_job_item:
layout.label(text=f"Active: {processing_job_item.name[:25]}...", icon='INFO')
if not scene.audio_render_queue: layout.label(text="Queue is empty.")
else:
for i, item in enumerate(scene.audio_render_queue):
box = layout.box()
main_row = box.row()
icon = 'FILE_SOUND'
if item.status == "PROCESSING": icon = 'PLAY'
elif item.status == "COMPLETED": icon = 'CHECKMARK'
elif item.status == "ERROR": icon = 'ERROR'
elif item.status == "PENDING": icon = 'TIME'
main_row.label(text=f"{i+1}. {item.name}", icon=icon)
main_row.label(text=f"({item.status})")
op_row = main_row.row(align=True)
op_rem_props = op_row.operator(SEQUENCER_OT_remove_from_audio_queue.bl_idname, text="", icon="X")
if op_rem_props:
op_rem_props.index = i
if item.status == "COMPLETED":
box.prop(item, "output_filepath", text="File", emboss=False, icon='FILEBROWSER')
elif item.status == "ERROR":
if item.error_message:
box.label(text=item.error_message, icon='CANCEL')
else:
box.label(text="Failed. Check console for details.", icon='CANCEL')
# --- Addon Preferences ---
class MyAddonPreferences(bpy.types.AddonPreferences):
bl_idname = get_addon_module_name()
model_items = [
("stabilityai/stable-audio-open-1.0", "Stable Audio Open 1.0", "Stability AI text-to-audio"),
("bark", "Bark", "Suno AI text-to-speech"),
("WhisperSpeech", "WhisperSpeech", "Collabora text-to-speech"),
("parler-tts/parler-tts-large-v1", "Parler TTS Large", "Parler TTS Large model"),
("facebook/musicgen-stereo-melody-large", "MusicGen Stereo Large", "Facebook AI music generation"),
("cvssp/audioldm2-large", "AudioLDM2 Large", "Audio generation model"),
("MMAudio", "MMAudio", "Multimodal Audio generation"),
]
audio_model_card: EnumProperty(name="Audio Model", items=model_items, default="stabilityai/stable-audio-open-1.0")
local_files_only: BoolProperty(name="Use Local Files Only (Hugging Face)", default=False)
exec_provider_items = [
("CPUExecutionProvider", "CPU", "Use CPU"),
("CUDAExecutionProvider", "CUDA (NVIDIA)", "Use NVIDIA GPU (requires PyTorch+CUDA)"),
("CoreMLExecutionProvider", "CoreML (Apple MPS)", "Use Apple Silicon GPU (requires PyTorch+MPS)"),
]
execution_provider: EnumProperty(name="Execution Provider", items=exec_provider_items, default="CPUExecutionProvider")
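# The provider identifiers mirror ONNX Runtime naming, but in this addon they only select the torch device ("cuda", "mps", or CPU) for generation.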
def draw(self, context):
layout = self.layout
layout.prop(self, "audio_model_card")
layout.prop(self, "local_files_only")
layout.prop(self, "execution_provider")
layout.label(text="Note: Model dependencies (PyTorch, diffusers, etc.)")
layout.label(text="must be installed into Blender's Python environment.")
if not torch: layout.label(text="PyTorch (torch) library not found!", icon='ERROR')
# --- Registration ---
prop_group_classes = (AudioQueueItem,)
other_classes = (
MyAddonPreferences,
SEQUENCER_OT_add_to_audio_queue,
SEQUENCER_OT_process_audio_queue,
SEQUENCER_OT_clear_audio_queue,
SEQUENCER_OT_remove_from_audio_queue,
SEQUENCER_PT_audio_generation_panel,
)
def register():
for cls in prop_group_classes: bpy.utils.register_class(cls)
register_scene_properties()
for cls in other_classes: bpy.utils.register_class(cls)
SEQUENCER_OT_process_audio_queue._previous_model_card = None
def unregister():
for cls in reversed(other_classes): bpy.utils.unregister_class(cls)
unregister_scene_properties()
for cls in reversed(prop_group_classes): bpy.utils.unregister_class(cls)
if SEQUENCER_OT_process_audio_queue._timer:
try:
if bpy.context.window_manager and bpy.context.window:
bpy.context.window_manager.event_timer_remove(SEQUENCER_OT_process_audio_queue._timer)
except (AttributeError, RuntimeError) as e: print(f"Could not remove timer during unregister: {e}")
SEQUENCER_OT_process_audio_queue._timer = None
SEQUENCER_OT_process_audio_queue._previous_model_card = None
if __name__ == "__main__":
try: unregister()
except Exception as e: print(f"Error during pre-unregister: {e}")
register()