How I built DiddyEye - a Telegram bot that uses NLP and computer vision to detect misinformation and AI-generated images in real-time, complete with reliability scoring and source verification.
Use the audio player below to listen to this article. You can customize the voice and reading speed with the settings button.
In an era where misinformation spreads faster than wildfire through private messaging apps, I built DiddyEye - an AI-driven Telegram bot that acts as your personal fact-checker, detecting both false claims and AI-generated images in real-time.
The problem is massive: False information influences public opinion, elections, and even public health decisions. Traditional fact-checking is too slow for the speed of modern misinformation. We needed an automated solution that could work at the speed of messaging.
Modern misinformation isn't just false text - it's sophisticated, combining misleading claims with AI-generated "evidence" images. DiddyEye tackles both:
Real-time fact-checking using NLP and source verification
Computer vision to identify manipulated visual content
Let's dive into how each works.
The first challenge: How do you fact-check a claim when you don't know what to search for?
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
class ClaimAnalyzer:
    """Extract search keywords from a free-text claim.

    The claim is lower-cased and tokenized with NLTK, non-alphanumeric tokens
    and NLTK English stop words are dropped, and the remaining text is fed to
    a TF-IDF vectorizer limited to ten unigram/bigram features.
    """

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.tfidf = TfidfVectorizer(
            max_features=10,
            stop_words='english',
            ngram_range=(1, 2),
        )

    def extract_keywords(self, claim):
        """Return up to five keywords for ``claim``, highest weight first.

        Args:
            claim: Raw claim text.

        Returns:
            A list of at most five unigram/bigram strings ordered by
            descending TF-IDF weight (ties broken alphabetically). The list
            is empty when the claim contains no usable terms.
        """
        # Remove noise and keep only meaningful terms.
        tokens = nltk.word_tokenize(claim.lower())
        filtered_tokens = [
            token for token in tokens
            if token.isalnum() and token not in self.stop_words
        ]
        if not filtered_tokens:
            return []

        try:
            tfidf_matrix = self.tfidf.fit_transform([' '.join(filtered_tokens)])
        except ValueError:
            # The vectorizer applies its own stop-word list and token pattern;
            # if nothing survives it reports an empty vocabulary.
            return []

        feature_names = self.tfidf.get_feature_names_out()
        # get_feature_names_out() is ordered by vocabulary index, not by
        # importance, so rank explicitly by the weights of the single
        # document that was fitted.
        weights = tfidf_matrix.toarray()[0]
        ranked = sorted(
            range(len(feature_names)),
            key=lambda i: (-weights[i], feature_names[i]),
        )
        return [str(feature_names[i]) for i in ranked[:5]]  # Top 5 keywords
This extracts the most relevant terms from user claims, focusing on entities, facts, and context rather than filler words.
Raw keyword search isn't enough - we need to find credible sources and extract meaningful content:
import requests
from bs4 import BeautifulSoup
import PyPDF2
from urllib.parse import urljoin, urlparse
class ContentScraper:
    """Find candidate sources via Google Custom Search and fetch their text.

    NOTE(review): ``scrape_content`` dispatches to ``extract_pdf_text`` and
    ``extract_xml_text``, which are not defined in the visible source; they
    are assumed to exist elsewhere. Until they do, those branches fall into
    the error handler and yield an empty string.
    """

    def __init__(self, google_api_key, search_engine_id):
        self.api_key = google_api_key
        self.search_engine_id = search_engine_id

    def search_credible_sources(self, keywords):
        """Query Google Custom Search for the given keywords.

        Args:
            keywords: Iterable of keyword strings; joined with spaces to
                form the query.

        Returns:
            The ``items`` list from the JSON response, or ``[]`` when the
            response has no ``items`` key or the query is blank.
        """
        query = ' '.join(keywords).strip()
        if not query:
            # Nothing to search for; avoid sending an empty query.
            return []

        search_url = "https://www.googleapis.com/customsearch/v1"
        params = {
            'key': self.api_key,
            'cx': self.search_engine_id,
            'q': query,
            'num': 10,
        }
        # A timeout keeps a stalled connection from blocking the caller.
        response = requests.get(search_url, params=params, timeout=10)
        return response.json().get('items', [])

    def scrape_content(self, url):
        """Download ``url`` and return its extracted text.

        The extractor is chosen from the response's Content-Type header
        (PDF, HTML or XML).

        Returns:
            The extracted text, or ``""`` when the request fails, the server
            answers with an HTTP error status, the content type is not
            handled, or extraction raises.
        """
        try:
            response = requests.get(url, timeout=10)
            # Do not treat 4xx/5xx error pages as source material.
            response.raise_for_status()
            content_type = response.headers.get('content-type', '').lower()
            if 'application/pdf' in content_type:
                return self.extract_pdf_text(response.content)
            if 'text/html' in content_type:
                return self.extract_html_text(response.text)
            if 'application/xml' in content_type or 'text/xml' in content_type:
                return self.extract_xml_text(response.text)
        except Exception as e:
            # Deliberately best-effort: one bad source must not abort the
            # whole fact-check, so report the problem and move on.
            print(f"Error scraping {url}: {e}")
        return ""

    def extract_html_text(self, html_content):
        """Return the visible text of an HTML document, whitespace-normalized.

        ``<script>`` and ``<style>`` elements are removed, every run of
        whitespace is collapsed to a single space, and the result is cut to
        the first 2000 characters.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        # Remove script and style elements
        for element in soup(["script", "style"]):
            element.decompose()
        # str.split() with no argument splits on any whitespace run and
        # drops empty pieces, so this collapses newlines, tabs and spaces.
        text = ' '.join(soup.get_text().split())
        return text[:2000]  # Limit to avoid token limits
This scraper handles multiple content formats and focuses on extracting clean, meaningful text for analysis.
Here's where the magic happens - comparing the claim against verified sources:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
class FactChecker:
    """Score a claim by its embedding similarity to scraped source texts."""

    # Weights of the best-matching source and of the mean over all sources
    # in the combined score.
    MAX_WEIGHT = 0.7
    MEAN_WEIGHT = 0.3
    # A source counts as "supporting" when its cosine similarity exceeds this.
    SUPPORT_THRESHOLD = 0.5

    def __init__(self):
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def calculate_reliability_score(self, claim, source_contents):
        """Compare ``claim`` against each source text and summarize the match.

        Args:
            claim: The claim text to verify.
            source_contents: Sequence of source texts. ``None``, empty and
                whitespace-only entries are ignored, since they carry no
                evidence.

        Returns:
            A dict with:
              - ``score``: ``MAX_WEIGHT * max + MEAN_WEIGHT * mean`` of the
                cosine similarities,
              - ``max_similarity``: the highest similarity,
              - ``supporting_sources``: number of sources whose similarity
                exceeds ``SUPPORT_THRESHOLD``,
              - ``total_sources``: number of non-blank sources compared.
            When no usable source is given, all four values are zero.
        """
        candidates = source_contents if source_contents is not None else []
        usable_sources = [
            text for text in candidates
            if isinstance(text, str) and text.strip()
        ]
        if not usable_sources:
            # np.max / np.mean are undefined on an empty array; without
            # evidence the claim simply gets the lowest score.
            return {
                'score': 0.0,
                'max_similarity': 0.0,
                'supporting_sources': 0,
                'total_sources': 0,
            }

        # Embed the original claim and all source contents
        claim_embedding = self.model.encode([claim])
        source_embeddings = self.model.encode(usable_sources)

        # One similarity per source (row 0 belongs to the single claim)
        similarities = cosine_similarity(claim_embedding, source_embeddings)[0]

        max_similarity = np.max(similarities)
        avg_similarity = np.mean(similarities)
        # Combine strongest evidence and overall consensus
        reliability_score = (
            max_similarity * self.MAX_WEIGHT + avg_similarity * self.MEAN_WEIGHT
        )
        return {
            'score': float(reliability_score),
            'max_similarity': float(max_similarity),
            'supporting_sources': int(
                np.count_nonzero(similarities > self.SUPPORT_THRESHOLD)
            ),
            'total_sources': len(similarities),
        }
The reliability score considers both the strongest supporting evidence and overall consensus across sources.
Raw similarity scores aren't enough - users need context:
import openai
class ExplanationGenerator:
    """Turn a reliability result into a short natural-language explanation."""

    def __init__(self, api_key):
        # The key is stored on the openai module itself, not on the instance.
        openai.api_key = api_key

    def generate_explanation(self, claim, reliability_score, top_sources):
        """Ask the chat model to explain why ``claim`` got its score.

        Args:
            claim: The claim text being explained.
            reliability_score: Dict providing ``score``,
                ``supporting_sources`` and ``total_sources``.
            top_sources: Sequence of source strings; only the first three
                are included in the prompt, one per line.

        Returns:
            The message content of the first completion choice.
        """
        score_value = reliability_score['score']
        supporting = reliability_score['supporting_sources']
        total = reliability_score['total_sources']
        sources_block = "\n".join(top_sources[:3])

        # The leading and trailing empty entries reproduce the newline that
        # opens and closes the prompt text.
        prompt_lines = [
            "",
            f'Analyze this claim: "{claim}"',
            f"Reliability Score: {score_value:.2f}",
            f"Supporting Sources: {supporting}/{total}",
            "Top Sources:",
            sources_block,
            "Provide a clear, factual explanation of why this claim received this reliability score.",
            "Include specific evidence from the sources that supports or contradicts the claim.",
            "Keep it concise but informative.",
            "",
        ]
        prompt = "\n".join(prompt_lines)

        completion = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=200,
        )
        first_choice = completion.choices[0]
        return first_choice.message.content
The second part of DiddyEye tackles visual misinformation:
import cv2
import tensorflow as tf
from tensorflow.keras.preprocessing import image
import numpy as np
class AIImageDetector:
    """Classify an image as AI-generated or not with a Keras model."""

    def __init__(self, model_path):
        self.model = tf.keras.models.load_model(model_path)
        self.input_shape = (224, 224, 3)

    def preprocess_image(self, image_path):
        """Load an image file as a normalized batch of one.

        The image is resized to the first two dimensions of
        ``self.input_shape``, converted to an array, given a leading batch
        axis and scaled by 1/255.
        """
        img = image.load_img(image_path, target_size=self.input_shape[:2])
        img_array = image.img_to_array(img)
        img_array = np.expand_dims(img_array, axis=0)
        img_array = img_array / 255.0  # Normalize
        return img_array

    def detect_ai_generated(self, image_path):
        """Run the detector on the image at ``image_path``.

        Returns:
            A dict with ``is_ai_generated`` (model output above 0.5),
            ``confidence`` (the raw model output ``prediction[0][0]``) and
            ``analysis`` (the heatmap from ``generate_heatmap``).
        """
        processed_image = self.preprocess_image(image_path)
        prediction = self.model.predict(processed_image)
        ai_probability = float(prediction[0][0])
        return {
            'is_ai_generated': ai_probability > 0.5,
            'confidence': ai_probability,
            'analysis': self.generate_heatmap(image_path, processed_image),
        }

    def generate_heatmap(self, image_path, processed_image):
        """Build a gradient-based heatmap of the pixels that drive the output.

        Args:
            image_path: Unused; kept for interface compatibility.
            processed_image: Batch-of-one array as produced by
                ``preprocess_image``.

        Returns:
            The colour image produced by ``cv2.applyColorMap`` with
            ``COLORMAP_JET``.
        """
        # GradientTape can only watch tensors, not NumPy arrays.
        inputs = tf.convert_to_tensor(processed_image, dtype=tf.float32)
        with tf.GradientTape() as tape:
            tape.watch(inputs)
            predictions = self.model(inputs)
        gradients = tape.gradient(predictions, inputs)

        if gradients is None:
            # The output does not depend on the input; show a blank map.
            saliency = np.zeros(tuple(inputs.shape[1:3]), dtype=np.float32)
        else:
            # Use the magnitude so negative gradients do not wrap around
            # when converted to uint8; take the strongest channel per pixel.
            saliency = tf.reduce_max(tf.abs(gradients), axis=-1).numpy()[0]

        # Scale to [0, 1] so the full colour range is used; raw gradients
        # are typically far smaller than 1.
        peak = float(saliency.max())
        if peak > 0:
            saliency = saliency / peak

        # Convert to heatmap
        heatmap = cv2.applyColorMap(
            np.uint8(255 * saliency), cv2.COLORMAP_JET
        )
        return heatmap
The heatmap visualization helps users understand which parts of an image triggered the AI detection.
Bringing it all together in a user-friendly Telegram interface:
import telegram
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters
class DiddyEyeBot:
    """Telegram front end that routes text to the fact checker and photos
    to the AI-image detector.

    NOTE(review): ``handle_text_message`` relies on
    ``FactChecker.verify_claim`` (expected to return ``score``,
    ``supporting_sources``, ``total_sources`` and ``explanation``) and on
    ``self.get_confidence_level`` / ``self.get_recommendation``; none of
    these are defined in the visible source and must be supplied elsewhere.
    """

    def __init__(self, token):
        self.bot = telegram.Bot(token=token)
        self.fact_checker = FactChecker()
        self.image_detector = AIImageDetector('models/ai_detector.h5')

    def handle_text_message(self, update, context):
        """Fact-check the text of an incoming message and reply with a report."""
        claim = update.message.text
        chat_id = update.effective_chat.id

        # Show processing message
        context.bot.send_message(
            chat_id=chat_id,
            text="🔍 Analyzing claim... This may take a few seconds."
        )

        # Process the claim
        result = self.fact_checker.verify_claim(claim)
        score = result['score']

        # Format response
        if score > 0.7:
            reliability_emoji = "✅"
        elif score > 0.4:
            reliability_emoji = "⚠️"
        else:
            reliability_emoji = "❌"

        # Leading/trailing empty entries keep the blank first and last line
        # of the message text.
        response = "\n".join([
            "",
            f"{reliability_emoji} **Reliability Score: {score:.2f}/1.00**",
            "📊 **Analysis:**",
            f"• Supporting Sources: {result['supporting_sources']}/{result['total_sources']}",
            f"• Confidence Level: {self.get_confidence_level(score)}",
            "📝 **Explanation:**",
            f"{result['explanation']}",
            f"💡 **Recommendation:** {self.get_recommendation(score)}",
            "",
        ])
        context.bot.send_message(
            chat_id=chat_id,
            text=response,
            parse_mode='Markdown'
        )

    def handle_image_message(self, update, context):
        """Download the photo of an incoming message, classify it and reply."""
        import os  # local import: only needed for temp-file handling

        # Download image
        photo = update.message.photo[-1]  # Get highest resolution
        tg_file = context.bot.get_file(photo.file_id)
        # The download fails if the target directory does not exist yet.
        os.makedirs("temp_images", exist_ok=True)
        file_path = f"temp_images/{photo.file_id}.jpg"
        try:
            tg_file.download(file_path)
            # Analyze image
            result = self.image_detector.detect_ai_generated(file_path)
        finally:
            # Never leave downloaded user images behind, even on failure.
            try:
                os.remove(file_path)
            except FileNotFoundError:
                pass

        # Format response
        if result['is_ai_generated']:
            response = "\n".join([
                "",
                "🤖 **AI-Generated Image Detected**",
                f"Confidence: {result['confidence']:.2f}",
                "This image appears to be artificially generated.",
                "Be cautious about treating it as authentic evidence.",
                "",
            ])
        else:
            response = "\n".join([
                "",
                "📸 **Real Image Detected**",
                f"Confidence: {1-result['confidence']:.2f}",
                "This image appears to be authentic photography.",
                "",
            ])
        context.bot.send_message(
            chat_id=update.effective_chat.id,
            text=response
        )
After testing with various claims and images:
Not all Google search results are reliable. We implemented domain scoring and content quality filters.
Claims often require context. Our embedding approach captures semantic meaning beyond just keywords.
API costs and limits required intelligent caching and request batching.
Balancing accuracy with speed was a constant trade-off in keeping the user experience responsive.
DiddyEye represents a new approach to combating misinformation - moving fact-checking from centralized newsrooms to distributed, AI-powered verification available to anyone.
Future enhancements planned:
The fight against misinformation is just beginning, but with AI tools like DiddyEye, we're giving truth a fighting chance in the age of instant messaging.
Want to try DiddyEye or contribute to the project? Check it out on GitHub and join the fight against fake news!