Automating Screenshot Analysis Using GPT-4 and Text-to-Speech: A Step-by-Step Tutorial
Hello there! If you've ever found yourself constantly taking screenshots and wishing you had a way to automatically analyze and get feedback from them, you're in the right place. In this tutorial, we'll walk you through setting up a nifty little script that does just that—using the power of GPT-4 and Text-to-Speech (TTS).
By the end of this tutorial, you'll have a script that:
Monitors a folder for new screenshots.
Analyzes these screenshots using GPT-4.
Provides insights and feedback through a text file and an audio response.
This tutorial is tailored for macOS users, but the concepts can easily be adapted for other operating systems. So, let's dive in and get you set up!
Prerequisites
Python: Ensure you have Python installed. You can download it from python.org.
pip: Make sure you have pip installed to manage Python packages.
OpenAI API Key: Sign up for an API key at platform.openai.com.
.env file: Create a .env file in your project directory with your OpenAI API key.
Step-by-Step Guide
Step 1: Initialize and Setup
First, let's initialize our environment and set up directories.
Create a new directory for your project and navigate into it:
mkdir screenshot_analyzer
cd screenshot_analyzer
Inside this directory, create the following subdirectories:
mkdir screenshots images audio responses
Now, create a .env file in your project directory and add your OpenAI API key:
OPENAI_API_KEY=your_openai_api_key
Next, we'll install the required libraries. Open your terminal and run the following command:
pip install python-dotenv watchdog pillow requests simpleaudio openai
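If you'd like to confirm the installs before writing any code, a quick import check from the terminal will do (note that Pillow imports as PIL):
python -c "import dotenv, watchdog, PIL, requests, simpleaudio, openai; print('All imports OK')"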
Create a new Python file named analyze_screenshots.py and add the following code:
import os
import time
from dotenv import load_dotenv
from datetime import datetime

# Load environment variables from .env file
load_dotenv()

# Set your OpenAI API key
api_key = os.getenv('OPENAI_API_KEY')
if not api_key:
    raise ValueError("OpenAI API key not found in .env file")

# Ensure the necessary folders exist
for folder in ['screenshots', 'images', 'audio', 'responses']:
    os.makedirs(folder, exist_ok=True)

def log(message):
    print(f"[{datetime.now()}] {message}")
Step 2: Helper Functions
Next, let's create helper functions for encoding images, analyzing images, saving responses, and generating TTS.
Encode Image:
import base64

def encode_image(image_path):
    log(f"Encoding image: {image_path}")
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')
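To see what this returns, you can call it on any image you have lying around (the path here is just a placeholder):
# Throwaway check: encode a test image and peek at the base64 output
encoded = encode_image("images/test.jpg")  # placeholder path; use any image you have
print(len(encoded), encoded[:60])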
Analyze Image:
import requests

def analyze_image(image_path):
    log(f"Analyzing image: {image_path}")
    encoded_image = encode_image(image_path)
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "model": "gpt-4o",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is in this image? If the image is a whiteboard or handwritten image, analyse the text and describe it as concisely as possible using this context. If the image describes a system, describe the system. If the image has post-it notes, read each post-it note and identify the themes and patterns present. Share with the user your insights about the patterns you have observed. If there are questions present in the image, try to infer the answers using the context from the image provided."},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}", "detail": "low"}}
                ]
            }
        ],
        "temperature": 0.5,
        "max_tokens": 300
    }
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    log(f"OpenAI response status: {response.status_code}")
    if response.status_code != 200:
        log(f"OpenAI response error: {response.text}")
    return response.json()
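The call returns the standard Chat Completions JSON, so you can test this function on its own before wiring up the watcher. A minimal check (the image path is a placeholder, and this assumes the request succeeded):
result = analyze_image("images/test.jpg")  # placeholder path
# The model's text lives at choices[0].message.content
print(result["choices"][0]["message"]["content"])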
Save Response:
import json

def save_response(image_path, analysis_json):
    log(f"Saving response for image: {image_path}")
    base_name = os.path.basename(image_path)
    name, _ = os.path.splitext(base_name)
    response_dir = os.path.join('responses', name)
    os.makedirs(response_dir, exist_ok=True)
    if 'choices' in analysis_json and len(analysis_json['choices']) > 0:
        analysis_text = analysis_json['choices'][0]['message']['content']
    else:
        analysis_text = "No analysis available."
    with open(os.path.join(response_dir, f'{name}.txt'), 'w') as f:
        f.write(f'Response for {base_name}\n\n')
        f.write('Analysis:\n')
        f.write(analysis_text)
        f.write('\n\n')
    # Save the full JSON response as well
    with open(os.path.join(response_dir, f'{name}_response.json'), 'w') as f:
        json.dump(analysis_json, f, indent=2)
    return analysis_text
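For a file named screenshot_1700000000.jpg, this produces a folder layout like the following (illustrative):
responses/
  screenshot_1700000000/
    screenshot_1700000000.txt
    screenshot_1700000000_response.json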
Generate and Play TTS:
import simpleaudio as sa
import openai

def generate_and_play_tts(text, audio_path):
    log(f"Generating TTS for text: {text}")
    response = openai.audio.speech.create(
        model="tts-1",
        voice="alloy",
        input=text,
        response_format="wav"
    )
    # Stream the response to a file
    with open(audio_path, 'wb') as audio_file:
        for chunk in response.iter_bytes():
            audio_file.write(chunk)
    log(f"TTS audio saved: {audio_path}")
    # Play the audio file
    wave_obj = sa.WaveObject.from_wave_file(audio_path)
    play_obj = wave_obj.play()
    play_obj.wait_done()
    log("TTS audio played")
Step 3: Main Task Function
Now, we combine these functions to create the main task function.
from PIL import Image

def perform_task(image_path):
    log(f"Function started: perform_task for {image_path}")
    # Wait for the file to be fully saved and stable
    time.sleep(5)
    log(f"Processing file: {image_path}")
    # Open the screenshot
    try:
        screenshot = Image.open(image_path)
        log(f"Screenshot opened: {image_path}")
    except Exception as e:
        log(f"Error opening image: {e}")
        return
    # Convert the image to RGB if it is in RGBA mode
    if screenshot.mode == 'RGBA':
        screenshot = screenshot.convert('RGB')
    # Save the screenshot as a JPEG in the "images" folder
    timestamp = int(time.time())
    processed_image_path = f'images/screenshot_{timestamp}.jpg'
    try:
        screenshot.save(processed_image_path, 'JPEG')
        log(f"Saved processed image: {processed_image_path}")
    except Exception as e:
        log(f"Error saving processed image: {e}")
        return
    # Analyze the image
    try:
        result = analyze_image(processed_image_path)
    except Exception as e:
        log(f"Error analyzing image: {e}")
        return
    # Save the response and generate TTS
    try:
        analysis_text = save_response(processed_image_path, result)
        audio_path = f'audio/response_{timestamp}.wav'
        generate_and_play_tts(analysis_text, audio_path)
    except Exception as e:
        log(f"Error saving response or generating TTS: {e}")
Step 4: File System Event Handler
We create a class to handle file system events and trigger the task function.
from watchdog.events import FileSystemEventHandler

class FileChangeHandler(FileSystemEventHandler):
    def __init__(self, file_extensions):
        self.file_extensions = [ext.lower() for ext in file_extensions]

    def _is_valid_extension(self, src_path):
        return any(src_path.lower().endswith(ext) for ext in self.file_extensions)

    def on_created(self, event):
        if not event.is_directory and self._is_valid_extension(event.src_path):
            log(f"File created: {event.src_path}")
            perform_task(event.src_path)

    def on_modified(self, event):
        if not event.is_directory and self._is_valid_extension(event.src_path):
            log(f"File modified: {event.src_path}")
            perform_task(event.src_path)

    def on_deleted(self, event):
        if not event.is_directory and self._is_valid_extension(event.src_path):
            log(f"File deleted: {event.src_path}")
Step 5: Main Execution
Finally, we set up the main execution to monitor the screenshots folder. Note that the script watches ~/Desktop/screenshots rather than the local screenshots folder created in Step 1, so create that folder on your Desktop if it doesn't already exist.
from watchdog.observers import Observer

if __name__ == "__main__":
    # Set the path to the screenshots folder on the Desktop
    user_home = os.path.expanduser("~")
    path = os.path.join(user_home, "Desktop", "screenshots")
    log(f"Monitoring path: {path}")
    # Specify the file extensions you want to monitor (matching is case-insensitive)
    file_extensions = [".png", ".jpg", ".webp"]
    if not os.path.exists(path):
        log(f"Path does not exist: {path}")
    else:
        event_handler = FileChangeHandler(file_extensions)
        observer = Observer()
        observer.schedule(event_handler, path, recursive=False)
        observer.start()
        try:
            log("Monitoring screenshots folder for new files. Press Ctrl+C to stop.")
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            observer.stop()
        observer.join()
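With everything in place, start the watcher from your project directory:
python analyze_screenshots.py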
And there you have it! Your very own screenshot analyzer script. Now, whenever you take a screenshot and save it in the specified folder, the script will automatically analyze it using GPT-4, generate a text response, and even read it out loud for you.
Give it a go.