Window Monitoring with Machine Vision
Published: March 25, 2024 | Last Modified: May 13, 2025
Tags: python machine-learning computer-vision monitoring moondream
Categories: Python Machine Learning
Github Repository: Window Watcher
Server-Side
Setup Environment
I am using the moondream2 model for fast inference.
Model Repository: Moondream on GitHub
Take the server.py script from this repo: Github Repository: Window Watcher
git clone https://github.com/vikhyat/moondream.git
cd moondream
python -m venv venv
venv\Scripts\activate
cd ../
pip install -r requirements.txt
pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121
pip install flask
Confirm CUDA is working; otherwise the script will fall back to the CPU.
nvcc --version
Server-Side Script
from flask import Flask, request, jsonify
from PIL import Image
from io import BytesIO
import argparse
import torch
import re
import time
import gradio as gr
from moondream import detect_device, LATEST_REVISION
from threading import Thread
from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM
# Flask application that exposes the image-to-text endpoint.
app = Flask(__name__)

# Command-line interface: `--cpu` forces CPU inference.
parser = argparse.ArgumentParser()
parser.add_argument("--cpu", action="store_true")
args = parser.parse_args()

# Pick the device/dtype pair the model will run with.
if not args.cpu:
    # detect_device() supplies both the device and the matching dtype.
    device, dtype = detect_device()
    if device != torch.device("cpu"):
        # Only announce the device when an accelerator was selected.
        print("Using device:", device)
        print("If you run into issues, pass the `--cpu` flag to this script.")
        print()
else:
    device, dtype = torch.device("cpu"), torch.float32

# Initialize the model
model_id = "vikhyatk/moondream2"
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=LATEST_REVISION)
# trust_remote_code=True lets transformers run the model code shipped in the
# model repository; the revision is pinned to LATEST_REVISION.
moondream = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    revision=LATEST_REVISION,
)
moondream = moondream.to(device=device, dtype=dtype)
# Switch to evaluation mode; this server only performs inference.
moondream.eval()
@app.route('/itt', methods=['POST'])
def get_answer():
    """Answer a text prompt about an uploaded image.

    Expects a multipart POST with an ``image`` file part and a ``prompt``
    form field.

    Returns:
        A JSON response ``{"text": <answer>}`` on success, or
        ``{"error": <message>}`` with HTTP status 400 when a field is
        missing or the uploaded file cannot be decoded as an image.
    """
    if 'image' not in request.files or 'prompt' not in request.form:
        return jsonify({"error": "Missing image file or prompt"}), 400

    image_file = request.files['image']
    prompt = request.form['prompt']

    try:
        image = Image.open(BytesIO(image_file.read()))
        # Image.open() is lazy; force decoding here so that a corrupt or
        # truncated upload is rejected as a client error instead of
        # surfacing later as an unhandled 500 inside the model call.
        image.load()
    except (OSError, Image.DecompressionBombError):
        return jsonify({"error": "Invalid or unsupported image file"}), 400

    # NOTE: the image is passed to the model at its uploaded size; no
    # resizing is performed here.
    image_embeds = moondream.encode_image(image)
    answer = moondream.answer_question(image_embeds, prompt, tokenizer)
    return jsonify({"text": answer})
if __name__ == "__main__":
    # Debug mode is for local development only; disable it for production.
    # The reloader is turned off because it re-executes this module in a
    # child process, which would load the model a second time.
    app.run(debug=True, use_reloader=False)
Client-Side
POST http://127.0.0.1:5000/itt