Skip to content

Instantly share code, notes, and snippets.

@pfn
Last active April 1, 2026 15:37
Show Gist options
  • Select an option

  • Save pfn/6d3ed2dc05bcf40f288f9c90b1680176 to your computer and use it in GitHub Desktop.

Select an option

Save pfn/6d3ed2dc05bcf40f288f9c90b1680176 to your computer and use it in GitHub Desktop.
PP/TG counter for openwebui
"""
title: Real-time PP and TG metrics
author: pfn0
author_url: https://github.com/pfn
funding_url: https://github.com/pfn
version: 0.9
"""
from pydantic import BaseModel, Field
import traceback
import tiktoken
import logging
import time
# Floating "Generating..." chip markup injected next to the chat input.
# Flattened to a single line and single-quotes swapped for double-quotes so it
# can be embedded safely inside a single-quoted JS string literal ('...') in
# the insertAdjacentHTML calls below.
CHIP_HTML = """
<div id="token-metrics-pptg-chip" class="absolute -top-12 left-0 right-0 flex z-30 pointer-events-none">
<button class="text-xs px-3 bg-white border border-gray-100 dark:border-none dark:bg-white/20 p-1.5 rounded-full pointer-events-auto">
Generating...
</button>
</div>
""".replace("\n", "").replace("'", '"')
# Shared tokenizer for client-side token counting. cl100k_base is used as a
# cross-model approximation; the exact backend tokenizer is unknown here.
ENCODING = tiktoken.get_encoding("cl100k_base")
def num_tokens_from_string(text: str) -> int:
    """Count the number of tokens in a string (0 for empty/None input)."""
    # Uses the module-level cl100k_base encoding as a common default — no
    # heuristics to guess which tokenizer the backing model actually uses.
    # (Previous comment said r50k_base; the code has always used cl100k_base.)
    if not text:
        return 0
    return len(ENCODING.encode(text))
class Filter:
    """Real-time prompt-processing (PP) / token-generation (TG) metrics.

    OpenWebUI filter that, while a response streams, shows a floating
    "Generating N t/s" chip above the chat input (via injected JS) and, on
    completion, replaces it with a one-line summary status.
    """

    class Valves(BaseModel):
        # Standard OpenWebUI filter configuration.
        priority: int = Field(
            default=999, description="Filter execution order. Lower values run first."
        )

    def __init__(self):
        self.valves = self.Valves()
        self.logger = logging.getLogger("pptg_metrics")
        # Per-chat streaming state, keyed by chat_id; entries are created in
        # inlet() and removed in outlet().
        self.chat_stats = {}
        self.ewma_alpha = 0.6  # EWMA smoothing factor for the displayed t/s rate

    async def inlet(
        self,
        body: dict,
        __event_emitter__,
        __metadata__: dict = None,
    ) -> dict:
        """Record the start time for streaming chats and show an initial status.

        Non-streaming requests are passed through untouched; metrics only make
        sense when tokens arrive incrementally via stream().
        """
        # Guard __metadata__ too: it defaults to None and the original code
        # would raise TypeError subscripting it.
        if body.get("stream") and __metadata__ and "chat_id" in __metadata__:
            now = time.time()
            self.chat_stats[__metadata__["chat_id"]] = {
                "start_time": now,
                "last_update": now,
                "tg": 0,  # cumulative generated-token count
                "last_token_count": 0,  # cumulative tokens at last rate update
                "ewma_tg": None,  # EWMA token rate (None until first calculation)
                "ewma_initialized": False,  # whether the first EWMA sample was taken
                # NOTE: last_stream_time is set lazily when generation starts,
                # so "thinking" time does not pollute the rate.
            }
            await __event_emitter__(
                {
                    "type": "status",
                    "data": {"description": "Processing prompt...", "done": False},
                }
            )
        return body

    async def stream(self, event: dict, __event_emitter__, __metadata__) -> dict:
        """Per-chunk hook: count tokens, maintain an EWMA t/s rate, update chip.

        Always returns the event unchanged; all work is side effects (stats +
        UI updates). Errors are logged, never raised, so streaming is never
        interrupted by a metrics failure.
        """
        try:
            if not isinstance(event, dict):
                return event
            now = time.time()
            chat_stats = self.chat_stats.get(__metadata__["chat_id"])
            if chat_stats is None:
                return event  # not a streaming chat we are tracking
            # Any first chunk marks time-to-first-token, even a content-free
            # one (e.g. a "thinking" delta): tokens are flowing either way.
            if "ttft" not in chat_stats:
                chat_stats["ttft"] = now - chat_stats["start_time"]
            content = None
            try:
                if "choices" in event and len(event["choices"]) > 0:
                    choice = event["choices"][0]
                    if "delta" in choice and "content" in choice["delta"]:
                        content = choice["delta"]["content"]
            except Exception as e:
                self.logger.error(f"Stream ex: {e}\n{traceback.format_exc()}")
                self.logger.error(f"Stream event: {event}")
            # Content-free chunks before any content are treated as thinking.
            if not content and "thinking_start" not in chat_stats:
                chat_stats["thinking_start"] = now
            if content:
                if "generation_start" not in chat_stats:
                    chat_stats["generation_start"] = now
                    # Initialize last_stream_time when generation starts (not
                    # in inlet) so the rate excludes prompt/thinking time.
                    chat_stats["last_stream_time"] = now
                chat_stats["tg"] += num_tokens_from_string(content)
            if "generation_start" not in chat_stats:
                return event
            gen_time = now - chat_stats["generation_start"]
            chat_stats["gen_time"] = gen_time
            if gen_time < 0.001:  # first content chunk: announce + create chip, once
                await __event_emitter__(
                    {
                        "type": "status",
                        "data": {
                            "description": f"Prompt processed in {chat_stats['ttft']:.1f}s, generating response...",
                            "done": False,
                        },
                    }
                )
                await __event_emitter__(
                    {
                        "type": "execute",
                        "data": {"code": f"""
(function() {{
    let chip = document.getElementById("token-metrics-pptg-chip");
    if (chip) {{
        chip.querySelector("button").textContent = "Generating...";
    }} else {{
        let chipbar = document.querySelector("div.w-full.font-primary > div > div > div.relative")
        chipbar.insertAdjacentHTML('beforeend', '{CHIP_HTML}');
    }}
}})();
"""},
                    }
                )
                return event
            # Refresh the EWMA token rate at most every 0.5s.
            if now - chat_stats["last_update"] > 0.50:
                chat_stats["last_update"] = now
                # Interval since the last stream() invocation that updated us.
                time_delta = now - chat_stats["last_stream_time"]
                chat_stats["last_stream_time"] = now
                # Tokens received during that interval.
                tokens_delta = chat_stats["tg"] - chat_stats["last_token_count"]
                chat_stats["last_token_count"] = chat_stats["tg"]
                # Instantaneous rate for this interval (guard zero interval).
                instantaneous_rate = tokens_delta / time_delta if time_delta > 0 else 0
                if not chat_stats["ewma_initialized"]:
                    # First sample: seed the EWMA with the raw rate.
                    chat_stats["ewma_tg"] = instantaneous_rate
                    chat_stats["ewma_initialized"] = True
                else:
                    # EWMA = alpha * new_value + (1 - alpha) * old_value
                    chat_stats["ewma_tg"] = (self.ewma_alpha * instantaneous_rate) + (
                        (1 - self.ewma_alpha) * chat_stats["ewma_tg"]
                    )
                # Update the chip with the smoothed rate.
                if gen_time > 0.001:
                    tg = chat_stats["ewma_tg"]
                    await __event_emitter__(
                        {
                            "type": "execute",
                            "data": {"code": f"""
(function() {{
    let chipContainer = document.getElementById("token-metrics-pptg-chip");
    if (!chipContainer) {{
        let chipbar = document.querySelector("div.w-full.font-primary > div > div > div.relative")
        chipbar.insertAdjacentHTML('beforeend', '{CHIP_HTML}');
    }}
    let chip = document.getElementById("token-metrics-pptg-chip").querySelector("button");
    chip.textContent = "Generating {tg:.1f} t/s";
}})();
"""},
                        }
                    )
        except Exception as e:
            self.logger.error(f"stream error: {e}\n{traceback.format_exc()}")
        return event

    async def outlet(self, body: dict, __event_emitter__, __metadata__) -> dict:
        """Final hook: remove the chip and emit a one-line summary status.

        Source priority for the summary: llama.cpp-style usage timings
        (predicted_n/ms/per_second) > OpenAI-style completion_tokens with our
        wall clock > our own token count and wall clock.
        """
        chat_id = __metadata__["chat_id"]
        try:
            # BUGFIX: non-streaming chats never register stats in inlet(); the
            # original raised KeyError here and again in the finally-del.
            chat_stats = self.chat_stats.get(chat_id)
            if chat_stats is None:
                return body
            tg = None
            result_str = "Response complete"
            if "ttft" in chat_stats:
                result_str = f"Prompt processed in {chat_stats['ttft']:.1f}s"
            gen_time = None
            if (
                "tg" in chat_stats
                and "gen_time" in chat_stats
                and chat_stats["gen_time"] > 0.001
            ):
                gen_time = chat_stats["gen_time"]
                tg = chat_stats["tg"] / gen_time
                result_str = f"Generated {chat_stats['tg']} tokens in {chat_stats['gen_time']:.1f}s at {tg:.1f} t/s"
            # usage is oddly nested in the last message in body, rather than
            # being top-level; guard an empty/missing message list.
            messages = body.get("messages") or []
            usage = messages[-1].get("usage") if messages else None
            if (
                usage
                and "predicted_n" in usage
                and "predicted_ms" in usage
                and "predicted_per_second" in usage
            ):
                result_str = f"Generated {usage['predicted_n']} tokens in {usage['predicted_ms'] / 1000:.1f}s at {usage['predicted_per_second']:.1f} t/s"
            elif usage and gen_time and "completion_tokens" in usage:
                if "thinking_start" in chat_stats:
                    # Server token counts include thinking tokens, so fold the
                    # thinking window back into the elapsed time.
                    gen_time += (
                        chat_stats["generation_start"] - chat_stats["thinking_start"]
                    )
                tokens = usage["completion_tokens"]
                gen_time = max(0.001, gen_time)
                result_str = f"Generated {tokens} tokens in {gen_time:.1f}s at {tokens / gen_time:.1f} t/s"
            await __event_emitter__(
                {
                    "type": "execute",
                    "data": {"code": f"""
(function() {{
    let chip = document.getElementById("token-metrics-pptg-chip");
    if (chip) chip.parentNode.removeChild(chip);
}})();
"""},
                }
            )
            await __event_emitter__(
                {
                    "type": "status",
                    "data": {
                        "description": result_str,
                        "done": True,
                    },
                }
            )
        except Exception as e:
            # Consistent with stream(): never let metrics break the response.
            self.logger.error(f"outlet error: {e}\n{traceback.format_exc()}")
        finally:
            # Safe even if the key was never created or already removed.
            self.chat_stats.pop(chat_id, None)
        return body
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment