Window Watcher
Overview
Github Repository: Window Watcher
Server-Side
Setup Environment
I am using the moondream2 model for fast inference.
Model Repository: Moondream on GitHub
Take the server.py script from this repo: Github Repository: Window Watcher
1git clone https://github.com/vikhyat/moondream.git
2cd moondream
3python -m venv venv
4venv\Scripts\activate
5cd ../
6pip install -r requirements.txt
7pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121
8pip install flask
Confirm CUDA is working, or else the script will fall back to the CPU.
1nvcc --version
Server-Side Script
1from flask import Flask, request, jsonify
2from PIL import Image
3from io import BytesIO
4
5import argparse
6import torch
7import re
8import time
9import gradio as gr
10from moondream import detect_device, LATEST_REVISION
11from threading import Thread
12from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM
13
# Flask application that exposes the inference endpoint.
app = Flask(__name__)

# Command line: `--cpu` forces CPU inference and skips device detection.
parser = argparse.ArgumentParser()
parser.add_argument("--cpu", action="store_true")
args = parser.parse_args()

if not args.cpu:
    # Let moondream choose the device/dtype pair for this machine.
    device, dtype = detect_device()
    if device != torch.device("cpu"):
        print("Using device:", device)
        print("If you run into issues, pass the `--cpu` flag to this script.")
        print()
else:
    device, dtype = torch.device("cpu"), torch.float32

# Load the tokenizer and model once at startup, both at LATEST_REVISION,
# then move the model to the selected device/dtype.
model_id = "vikhyatk/moondream2"
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=LATEST_REVISION)
moondream = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    revision=LATEST_REVISION,
)
moondream = moondream.to(device=device, dtype=dtype)
moondream.eval()
37
@app.route('/itt', methods=['POST'])
def get_answer():
    """Answer a text prompt about an uploaded image (image-to-text endpoint).

    Expects a multipart/form-data POST containing:
      * ``image``  - the image file to inspect (file field).
      * ``prompt`` - the question to ask about the image (form field).

    Returns:
        JSON ``{"text": <answer>}`` on success, or JSON ``{"error": <message>}``
        with HTTP 400 when a field is missing or the upload cannot be decoded
        as an image.
    """
    if 'image' not in request.files or 'prompt' not in request.form:
        return jsonify({"error": "Missing image file or prompt"}), 400

    image_file = request.files['image']
    prompt = request.form['prompt']

    try:
        image = Image.open(BytesIO(image_file.read()))
        # Image.open only identifies the file; force a full decode here so a
        # corrupt or truncated upload is rejected with a 400 instead of
        # surfacing as a 500 from inside the model.
        image.load()
    except (OSError, Image.DecompressionBombError):
        # UnidentifiedImageError is a subclass of OSError, so non-image
        # uploads land here as well.
        return jsonify({"error": "Invalid image file"}), 400

    image_embeds = moondream.encode_image(image)
    answer = moondream.answer_question(image_embeds, prompt, tokenizer)

    return jsonify({"text": answer})
if __name__ == "__main__":
    # Keep debug mode off. Flask's debug mode enables the Werkzeug interactive
    # debugger, which lets anyone who can reach the server run arbitrary
    # Python, and its reloader imports this script a second time, loading the
    # model twice. Turn it on only for local development.
    app.run(debug=False)
Client-Side
POST http://127.0.0.1:5000/itt