For Faheem · Based on David Lawton's production code at Fluency · April 2026
DeepSeek exposes an OpenAI-compatible API, so we keep the same Python library and only point the client at a different `base_url` (with a DeepSeek API key and model name).
David's config
# Legacy OpenAI client setup from the original notebooks.
from openai import OpenAI

# NOTE(review): the organization ID and API key are written inline here;
# presumably the real values are kept out of source control — confirm.
client = OpenAI(
    organization='org-5AXapliFSQImDsgpiF6PJs0t',
    api_key="sk-proj-..."
)
# Model name kept in one place; the batch loop later in this guide passes
# base_model as the first argument of get_response().
base_model = "gpt-3.5-turbo"
config.py
from openai import OpenAI

# Same library — just change base_url and model
# NOTE(review): the key below is a placeholder; load the real key from the
# environment or a secrets store rather than committing it.
client = OpenAI(
    api_key="sk-your-deepseek-api-key",
    base_url="https://api.deepseek.com"  # only difference
)
# Shared model identifier; classifier.py imports MODEL together with client.
MODEL = "deepseek-chat"  # DeepSeek V3
David's original code parses GPT responses by splitting on newlines and colons. This breaks constantly when GPT changes formatting.
David's parser — breaks often
# Positional parser: reads lines 0, 2, 4 and 6 of row['Result'] and keeps only
# the text between the first and second colon of each line, so a value that
# itself contains a colon is cut short.
# Splits on newlines and colons — one extra newline crashes everything
row['Result'].split('\n')[0].split(':')[1].strip()  # sentiment
row['Result'].split('\n')[2].split(':')[1].strip()  # trust
row['Result'].split('\n')[4].split(':')[1].strip()  # explanation
row['Result'].split('\n')[6].split(':')[1].strip()  # attributes
David's clean_gpt_json
import re, json
def clean_gpt_json(gpt_response):
    """Sanitise a raw GPT reply and decode it as JSON.

    The substitutions run in a fixed order: control characters become
    spaces, quoted ``true``/``false``/``null`` literals are unquoted, and
    markdown code-fence markers are removed. The remaining text is stripped
    and handed to ``json.loads``; its exceptions propagate unchanged.
    """
    substitutions = (
        (r'[\x00-\x1f\x7f]', ' '),        # control characters (vertical tabs, etc.)
        (r'"(true|false|null)"', r'\1'),  # incorrectly quoted booleans / null
        (r'```(json)?', ''),              # markdown fences
    )
    text = gpt_response
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return json.loads(text.strip())
parse_response.py
import json, re
def parse_llm_response(raw: str) -> dict:
    """Parse an LLM JSON response into a normalised label dict.

    Handles markdown fences, control characters, capitalisation issues and
    confidence clamping.

    Args:
        raw: Text returned by the model, expected to hold one JSON object,
            optionally wrapped in a markdown code fence.

    Returns:
        The decoded object with ``sentiment`` capitalised and ``emotion`` /
        ``intent`` lower-cased (only when those values are strings), and
        ``confidence`` forced into ``[0.0, 1.0]``. A missing, null,
        non-numeric or NaN confidence becomes ``0.5``.

    Raises:
        json.JSONDecodeError: If the text is not valid JSON, or decodes to
            something other than a JSON object.
    """
    text = raw.strip()
    # Strip markdown fences (DeepSeek sometimes wraps in ```json). Works with
    # or without a newline after the opening fence and without a closing one.
    if text.startswith("```"):
        unfenced = re.sub(r'^```[A-Za-z0-9_+-]*', '', text)
        text = unfenced.rsplit("```", 1)[0].strip()
    # Remove control characters
    text = re.sub(r'[\x00-\x1f\x7f]', ' ', text)
    result = json.loads(text)
    if not isinstance(result, dict):
        # A bare list/string/number cannot carry labels; report it as a
        # parse failure instead of failing later with an unrelated error.
        raise json.JSONDecodeError("Expected a JSON object", text, 0)
    # Normalise capitalisation (David's known GPT issue). Non-string values
    # (e.g. null) are left untouched rather than raising.
    for key, normalise in (("sentiment", str.capitalize),
                           ("emotion", str.lower),
                           ("intent", str.lower)):
        value = result.get(key)
        if isinstance(value, str):
            result[key] = normalise(value.strip())
    # Clamp confidence to valid range
    try:
        confidence = float(result.get("confidence", 0.5))
    except (TypeError, ValueError):
        confidence = 0.5
    if confidence != confidence:  # NaN would otherwise clamp to 1.0
        confidence = 0.5
    result["confidence"] = max(0.0, min(1.0, confidence))
    return result
David's prompts all follow the same structure: role assignment, context, task list, output format. We keep the structure but replace the generic role with our 5-section rich system prompt.
David's prompt — government trust analysis
def get_response(model, tokens, post, comment):
    """Ask the chat model to analyse one government post / comment pair.

    The instructions, the post and the comment are sent as a single user
    message at temperature 0.0 with ``max_tokens=tokens``. Returns the text
    content of the first choice.
    """
    user_prompt = f"""
You are a political analyst analysing comments on social media.
You will see both the outbound post from government and the
comment back. Your tasks are to:
1. Classify the sentiment as Positive, Neutral or Negative
2. Identify trust attributes (comma separated)
3. Rate trust on a scale of 1-100
Post: {post}
Comment: {comment}
"""
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": user_prompt}],
        max_tokens=tokens,
        temperature=0.0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
prompts/deepseek_v1.json
{
"version": "v1",
"role": "You are an expert classifier for British social media posts from Facebook entertainment and community pages.",
"taxonomy": {
"sentiment": {
"Positive": "Post expresses approval, joy, amusement, support...",
"Negative": "Post expresses disapproval, anger, frustration...",
"Neutral": "Post is informational, no clear emotion signal"
},
"emotion": {
"joy": "Amusement, happiness, delight. Laughing emoji dominant",
"anger": "Frustration, outrage, irritation. Explicit complaints",
"sadness": "Grief, loss, disappointment",
"fear": "Worry, anxiety, concern about safety",
"surprise": "Unexpected reaction, shock",
"love": "Affection, admiration. Heart emoji + long text",
"neutral": "No emotional signal. Factual or dry statement"
}
},
"rules": [
"IF laughing emoji on short caption THEN joy NOT anger",
"IF 'cool cool cool' or 'saving now' THEN negative (sarcasm)",
"IF prison page + real named event THEN topic=Crime NOT Humour",
"IF present tense declaration THEN intent=Announce NOT Inform",
"IF long post (200+ words) with heart emoji THEN love NOT joy"
],
"few_shots": [
{
"post": "When your cellmate steals your last biscuit 😂🤣",
"labels": {"sentiment": "Positive", "emotion": "joy",
"topic": "Humour", "intent": "Entertain"},
"reason": "Laughing emoji is dominant signal. Comedic observation."
}
],
"output_format": "Return ONLY valid JSON: {\"sentiment\", \"emotion\", \"topic\", \"intent\", \"toxicity\", \"confidence\": 0-1, \"reasoning\"}"
}
classifier.py
import json
from config import client, MODEL
from parse_response import parse_llm_response
def load_system_prompt(version: str = "v1") -> str:
    """Load a versioned prompt file and assemble it into one system message.

    Reads ``prompts/deepseek_{version}.json`` (relative to the current
    working directory) and concatenates its five sections in order: role,
    taxonomy, decision rules, few-shot examples and output format.

    Args:
        version: Suffix of the prompt file to load, e.g. ``"v1"``.

    Returns:
        The assembled system prompt text.

    Raises:
        FileNotFoundError: If the prompt file for ``version`` does not exist.
        KeyError: If the file lacks one of the five expected sections.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # The prompt files contain emoji, so read them as UTF-8 explicitly
    # instead of relying on the platform's default encoding.
    with open(f"prompts/deepseek_{version}.json", encoding="utf-8") as f:
        cfg = json.load(f)
    # ensure_ascii=False keeps emoji in the taxonomy and few-shot examples as
    # real characters rather than \uXXXX escapes, so the model sees them the
    # same way they appear in the posts it is asked to classify.
    sections = [
        cfg["role"],
        "\n\nTAXONOMY (locked):\n"
        + json.dumps(cfg["taxonomy"], indent=2, ensure_ascii=False),
        "\n\nDECISION RULES:\n" + "\n".join(cfg["rules"]),
        "\n\nEXAMPLES:\n"
        + json.dumps(cfg["few_shots"], indent=2, ensure_ascii=False),
        "\n\nOUTPUT FORMAT:\n" + cfg["output_format"],
    ]
    return "\n".join(sections)
def classify_post(post_text: str, system_prompt: str) -> "dict | None":
    """Classify a single post using the rich system prompt.

    Args:
        post_text: The post to classify; sent as the user message.
        system_prompt: The assembled system prompt.

    Returns:
        The dict produced by ``parse_llm_response`` (the prompt asks for
        sentiment, emotion, topic, intent, toxicity, confidence, reasoning),
        or ``None`` when the API call fails or the reply cannot be parsed.
        Failures are printed, never raised.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Classify this post:\n{post_text}"}
    ]
    # The request and the parsing are guarded separately so each failure is
    # reported under the right label.
    try:
        response = client.chat.completions.create(
            model=MODEL,
            messages=messages,
            temperature=0.0,
            max_tokens=500,
        )
        raw = response.choices[0].message.content
    except Exception as e:
        print(f"API error: {e}")
        return None
    # content may be None or blank; there is nothing to parse in that case.
    if not raw or not raw.strip():
        print("Parse error: empty response content")
        return None
    try:
        return parse_llm_response(raw.strip())
    except Exception as e:
        print(f"Parse error: {e}")
        return None
David uses ThreadPoolExecutor in every notebook: 100 workers for Apify scraping, 8 for classification. We use the same pattern with rate limit handling.
David's concurrent pattern
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
def run_concurrently(urls, max_workers=100):
    """Run ``facebook_comments`` over every URL on a thread pool.

    Results are gathered in completion order, not input order. A URL whose
    job raises is reported with ``print`` and contributes nothing to the
    returned list.
    """
    collected = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = []
        for url in urls:
            pending.append(pool.submit(facebook_comments, url))
        for done in tqdm(as_completed(pending), total=len(pending)):
            try:
                outcome = done.result()
            except Exception as e:
                print(f"Error: {e}")
            else:
                collected.append(outcome)
    return collected
batch_classifier.py
import pandas as pd
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from classifier import classify_post
def classify_batch(df: pd.DataFrame,
                   system_prompt: str,
                   text_col: str = "post_content",
                   max_workers: int = 8) -> pd.DataFrame:
    """Classify all posts in a DataFrame.

    Same ThreadPoolExecutor pattern as David, with:
    - Backoff and a single retry when a post fails (rate limits included)
    - Progress bar (tqdm)
    - Error collection
    - Skip posts with text < 10 chars

    Args:
        df: Frame holding the posts; label columns are written onto it.
        system_prompt: System prompt passed to ``classify_post``.
        text_col: Column containing the text to classify.
        max_workers: Size of the thread pool.

    Returns:
        ``df`` itself, with sentiment, emotion, intent, topic, toxicity,
        confidence and reasoning columns set (``None`` for skipped or
        failed rows).
    """
    # Results are stored by row POSITION, not index label: callers pass
    # filtered frames whose labels are not 0..n-1, and the final column
    # assignment from a list is positional.
    results = [None] * len(df)
    errors = []

    def _process(pos, text):
        """Classify one post, retrying once after a short pause."""
        try:
            result = classify_post(text, system_prompt)
        except Exception as e:
            if "429" not in str(e):
                raise
            result = None
        if result is None:
            # classify_post reports failures by returning None, so a None
            # result is the only rate-limit signal visible here.
            time.sleep(2)  # rate limit backoff
            result = classify_post(text, system_prompt)
        return pos, result

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {}
        for pos, value in enumerate(df[text_col]):
            text = str(value) if pd.notna(value) else ""
            if not text.strip() or len(text) < 10:
                continue
            futures[executor.submit(_process, pos, text)] = pos
        for future in tqdm(as_completed(futures),
                           total=len(futures), desc="Classifying"):
            pos = futures[future]
            try:
                _, result = future.result()
            except Exception as e:
                errors.append((df.index[pos], str(e)))
                continue
            if result is None:
                errors.append((df.index[pos], "no result after retry"))
            else:
                results[pos] = result
    # Unpack into columns (same as David's pattern)
    for col in ["sentiment", "emotion", "intent", "topic",
                "toxicity", "confidence", "reasoning"]:
        df[col] = [r.get(col) if r else None for r in results]
    print(f"\nClassified {len(futures)} posts | Errors: {len(errors)}")
    return df
David trusts every GPT result. We add confidence-based routing. This is our addition — no equivalent in David's code.
tier_router.py
def apply_tier_routing(df: pd.DataFrame) -> pd.DataFrame:
    """Route posts through ArtemisAI's 3-tier system.

    Tier 1 (>= 0.85): Accept our model's label. Free. ~75%
    Tier 2 (0.60 to < 0.85): DeepSeek Agent verifies. ~18%
    Tier 3 (< 0.60 or missing): DeepSeek Agent relabels from scratch. ~7%

    Args:
        df: Frame with a ``confidence`` column.

    Returns:
        ``df`` itself with an added ``tier`` column. A per-tier summary is
        printed as a side effect.
    """
    def assign_tier(conf):
        # A missing confidence (None or NaN) is treated as the least
        # trustworthy case.
        if pd.isna(conf):
            return 3
        if conf >= 0.85:
            return 1
        if conf >= 0.60:
            return 2
        return 3

    df["tier"] = df["confidence"].apply(assign_tier)
    # Summary
    total = len(df)
    for tier in [1, 2, 3]:
        count = int((df["tier"] == tier).sum())
        # Guard the empty-frame case so the summary does not divide 0 by 0.
        pct = count / total * 100 if total else 0.0
        print(f" Tier {tier}: {count:,} posts ({pct:.1f}%)")
    return df
David runs HuggingFace models locally for emotion/sentiment. This is the same pattern our fine-tuned models will use.
David's emotion model — Emotions.ipynb
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer and classification model are loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "joeddav/distilbert-base-uncased-go-emotions-student"
)
model = AutoModelForSequenceClassification.from_pretrained(
    "joeddav/distilbert-base-uncased-go-emotions-student"
)
# Inference loop
# Scores one post per iteration. `db` is not created in this snippet —
# presumably a list defined earlier in the notebook; confirm.
for x in tqdm(range(len(df))):
    snippet = df['Post'][x]  # label-based lookup: assumes a default 0..n-1 index
    inputs = tokenizer(snippet, return_tensors="pt",
                       truncation=True, max_length=512)
    outputs = model(**inputs)
    # Softmax over the last dimension, then take the single row for this post.
    probs = outputs.logits.softmax(dim=-1).tolist()[0]
    db.append(probs)
local_models.py
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Load our fine-tuned model (trained on 20K labelled posts)
# Loaded once at import time from a local directory; predict_sentiment()
# reads these two module-level objects.
tokenizer = AutoTokenizer.from_pretrained("./models/sentiment_v1")
model = AutoModelForSequenceClassification.from_pretrained(
    "./models/sentiment_v1"
)
def predict_sentiment(text: str) -> dict:
    """Score one text with the module-level sentiment model.

    The text is tokenised with truncation at 256 tokens and run through the
    model with gradients disabled. Returns a dict with ``label`` (highest
    probability class), ``confidence`` (that class's probability) and
    ``all_probs`` (probability per class name).
    """
    # NOTE(review): assumes the model's output order is Negative, Neutral,
    # Positive — confirm against the checkpoint's label mapping.
    class_names = ["Negative", "Neutral", "Positive"]
    encoded = tokenizer(text, return_tensors="pt",
                        truncation=True, max_length=256)
    with torch.no_grad():
        logits = model(**encoded).logits
    distribution = logits.softmax(dim=-1)[0]
    best = distribution.argmax().item()
    per_class = {}
    for name, prob in zip(class_names, distribution):
        per_class[name] = prob.item()
    return {
        "label": class_names[best],
        "confidence": distribution[best].item(),
        "all_probs": per_class,
    }
main.py — complete weekly pipeline
import json

import pandas as pd
from tqdm import tqdm  # used for the progress bar in Step 3

from batch_classifier import classify_batch
from classifier import classify_post, load_system_prompt
from local_models import predict_sentiment, predict_emotion
from local_models import predict_topic, predict_intent
from local_models import predict_toxicity, predict_language
from tier_router import apply_tier_routing

# NOTE(review): only predict_sentiment is defined in the local_models.py
# snippet of this guide; the other five predict_* helpers must exist with
# the same return shape ({"label", "confidence", ...}) for Step 3 to run.

# ── Step 1: Load the rich system prompt (versioned) ──
# NOTE(review): this loads prompts/deepseek_v4.json; the guide only shows a
# v1 file — confirm the v4 file exists.
system_prompt = load_system_prompt("v4")
# ── Step 2: Load new posts from Redshift ──
# NOTE(review): `conn` is not created anywhere in this script; a database
# connection must be set up before this line.
df = pd.read_sql("""
SELECT post_id, post_content, page_name, created_time
FROM odl.dim_posts
WHERE LENGTH(post_content) > 10
AND created_time > CURRENT_DATE - INTERVAL '7 days'
""", conn)
# ── Step 3: Our 6 fine-tuned models label everything ──
# Each task adds two columns: the label and "<task>_conf".
for task, fn in [("sentiment", predict_sentiment),
                 ("emotion", predict_emotion),
                 ("topic", predict_topic),
                 ("intent", predict_intent),
                 ("toxicity", predict_toxicity),
                 ("language", predict_language)]:
    results = [fn(text) for text in tqdm(df["post_content"])]
    df[task] = [r["label"] for r in results]
    df[f"{task}_conf"] = [r["confidence"] for r in results]
# ── Step 4: Overall confidence = min across all tasks ──
# The weakest task decides the post's tier.
conf_cols = [c for c in df.columns if c.endswith("_conf")]
df["confidence"] = df[conf_cols].min(axis=1)
# ── Step 5: Apply tier routing ──
df = apply_tier_routing(df)
# ── Step 6: Tier 2/3 go to DeepSeek Agent ──
# Work on a copy of the uncertain rows, then merge the new labels back by
# index.
tier2_3 = df[df["tier"].isin([2, 3])].copy()
tier2_3 = classify_batch(tier2_3, system_prompt, max_workers=8)
df.update(tier2_3)
# ── Step 7: Low-conf DeepSeek posts go to Claude ──
low_conf = df[(df["tier"].isin([2, 3])) & (df["confidence"] < 0.80)]
# ... Claude verification and prompt update logic
# ── Step 8: Write to Redshift ──
df.to_sql("post_labels", conn, if_exists="append", index=False)
print(f"Done: {len(df)} posts labelled")
| Pattern | David (Fluency) | Us (ArtemisAI) |
|---|---|---|
| API client | OpenAI GPT-3.5-turbo | DeepSeek V3 (same lib, change base_url) |
| Prompt | Generic role + task list | Rich versioned 5-section system prompt |
| Output parsing | Split on newlines (fragile) | JSON forced output + parse_llm_response() |
| Concurrency | ThreadPoolExecutor (100 workers) | Same pattern (8 workers + rate limit backoff) |
| Local models | distilbert, twitter-roberta | XLM-RoBERTa fine-tuned on 20K posts |
| Topic modelling | LDA (unsupervised, 40 clusters) | LLM classification (locked 13 labels) |
| Confidence | None | 0.0-1.0 score per prediction |
| Quality control | Trust every result | 3-tier routing + Claude verification |
| Cost per post | ~$0.002 (GPT) | ~$0.0003 (DeepSeek) = 7x cheaper |
David uses HuggingFace's zero-shot-classification pipeline to classify posts into custom categories without any training data. He defines tag pairs and the model scores each post. This is powerful for new classification tasks before we have labelled data.
David's zero-shot — Emotions.ipynb
from transformers import pipeline

# No model is named here — presumably the library's default zero-shot
# checkpoint is used; confirm.
classifier = pipeline("zero-shot-classification")
# Define tag pairs — the model scores both sides
tag_list = [
    ['Rational', 'Emotional'],
    ['Self Revealing', 'Fact Oriented'],
    ['Action Seeking', 'Information Seeking']
]
tags = tag_list[2]  # pick a pair
# Build one single-row DataFrame per post: a score column per tag plus the
# post text.
dfs = []
for x in tqdm(range(len(df))):
    text = df['Post'][x]  # label-based lookup: assumes a default 0..n-1 index
    res = classifier(str(text), tags, multi_label=False)
    df_temp = pd.DataFrame([res['scores']])
    df_temp.columns = res['labels']
    df_temp['Post'] = str(df['Post'][x])
    dfs.append(df_temp)
# Stack the per-post rows and copy the result to the clipboard.
pd.concat(dfs).to_clipboard()
zero_shot_classifier.py
from transformers import pipeline
import pandas as pd
from tqdm import tqdm
# Module-level pipeline with an explicitly pinned model; built once at import
# time and reused by zero_shot_classify().
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")
def zero_shot_classify(df: pd.DataFrame,
                       text_col: str,
                       labels: list,
                       multi_label: bool = False) -> pd.DataFrame:
    """Score every post against custom labels without any training data.

    Typical uses:
    - A new client asks for a category we have never trained on
    - Quick prototyping of a new NLP task
    - Psycholinguistic profiling (David's use case)

    Args:
        df: Frame containing the posts.
        text_col: Column with the text; each value is passed through ``str``.
        labels: Candidate labels, e.g. ['Rational', 'Emotional'] or
            ['product_feedback', 'customer_service', 'general'].
        multi_label: Forwarded to the pipeline call.

    Returns:
        ``df`` with its index reset, joined column-wise with one score
        column per label plus ``predicted`` and ``confidence``, which hold
        the first label and first score the pipeline returns.
    """
    scored_rows = []
    for text in tqdm(df[text_col], desc="Zero-shot"):
        res = classifier(str(text), labels, multi_label=multi_label)
        ranked_labels, ranked_scores = res['labels'], res['scores']
        row = dict(zip(ranked_labels, ranked_scores))
        row['predicted'] = ranked_labels[0]
        row['confidence'] = ranked_scores[0]
        scored_rows.append(row)
    scores_df = pd.DataFrame(scored_rows)
    posts_df = df.reset_index(drop=True)
    return pd.concat([posts_df, scores_df], axis=1)
# Example: psycholinguistic profiling (David's exact use case)
df = zero_shot_classify(df, "post_content",
                        ["Rational", "Emotional"])
# Example: new client wants virality type scoring
# NOTE(review): zero_shot_classify() always appends 'predicted' and
# 'confidence' columns, so running it again on the frame returned above
# yields duplicate column names — rename or drop them between calls.
df = zero_shot_classify(df, "post_content",
                        ["viral_potential", "niche_content",
                         "evergreen", "time_sensitive"],
                        multi_label=True)
David structures multi-attribute scoring prompts for brand analysis. Each driver gets a -1/0/+1 score. He uses this for Asda, Goodyear, Barclays, and Bobcat. This pattern is directly applicable to our future brand sentiment features.
David's brand equity — OpenAI_Basic.ipynb
def get_response(model, tokens, post):
    """Request brand-equity driver scores for one post about Asda.

    The prompt asks for a JSON object mapping each of ten drivers to
    -1, 0 or 1. The call uses temperature 0.0 and ``max_tokens=tokens``.
    Returns the full completion object, not just the message text.
    """
    user_prompt = f"""
Please analyze the following social media post to assess its
sentiment toward Asda across specific brand equity drivers.
Each driver should be rated as follows:
- **1** if the driver is present and has a positive sentiment
- **0** if the driver is not relevant or neutral
- **-1** if the driver is present and has a negative sentiment
The drivers are: status, control, connection, individuality,
improvement, safety, nurturing, vitality, exploration, pleasure.
Here is the post: {post}
Please return your response as a JSON object with each driver
as a key and the score as the value.
"""
    chat_messages = [{"role": "user", "content": user_prompt}]
    return client.chat.completions.create(
        model=model,
        messages=chat_messages,
        max_tokens=tokens,
        temperature=0.0,
    )
David's pillar classification — OpenAI_Basic.ipynb
def get_response(model, tokens, post_message):
    """Build the content-pillar classification prompt for one post.

    The pillar names and descriptions are embedded in the prompt as JSON and
    the model is asked to reply with ``{"pillar": "pillar name"}``. The
    remainder of the function is elided in this excerpt (``...``).
    """
    # Pillar name -> description with example content types.
    pillars = {
        "Move your Mountains": """Build trust and value in Bobcat
by empowering our community with knowledge and resources.
Example: Step-by-step guides, Expert advice, Tool tips""",
        "Change What's Possible": """Showcase innovation and
versatility. Example: New attachments, Product launches""",
    }
    # Doubled braces in the last line render as literal braces in the f-string.
    prompt = f"""Classify this post into one of these pillars:
{json.dumps(pillars, indent=2)}
Post: {post_message}
Return JSON: {{"pillar": "pillar name"}}"""
    ...
# Batch loop
# Collect [post, pillar] pairs. Posts whose reply cannot be parsed are still
# skipped (best-effort), but the reason is printed instead of being silently
# discarded, and Ctrl-C is no longer trapped by a bare except.
db = []
for post in tqdm(df['Post']):
    res = get_response(base_model, 400, post)
    try:
        pillar = json.loads(res.choices[0].message.content)['pillar']
    except Exception as e:
        print(f"Skipping post (unparseable pillar response): {e}")
        continue
    db.append([post, pillar])
brand_equity.py — future Sprint 9+ feature
def build_brand_equity_prompt(page_name: str,
                              drivers: dict) -> str:
    """Assemble the engagement-driver scoring prompt for one page.

    Follows David's pattern: list each driver with its description, ask for
    a -1/0/+1 score per driver, and request JSON output.

    Args:
        page_name: e.g. "The HMP Bible"
        drivers: dict of driver_name: description

    Returns:
        The prompt text, with one "- name: description" bullet per driver.
    """
    bullet_lines = []
    for name, desc in drivers.items():
        bullet_lines.append(f"- {name}: {desc}")
    driver_text = "\n".join(bullet_lines)
    prompt = f"""Analyze this Facebook post from {page_name} and
score each engagement driver:
{driver_text}
Score each driver:
1 = driver present, positive engagement
0 = not relevant or neutral
-1 = driver present, negative engagement
Return ONLY valid JSON with driver names as keys and scores as values.
Include "reasoning": brief explanation."""
    return prompt
# ArtemisAI engagement drivers (customisable per client)
# Maps driver name -> one-line description, the shape that
# build_brand_equity_prompt() takes as its `drivers` argument.
ARTEMIS_DRIVERS = {
    "humour": "Post uses comedy, memes, or wit to engage",
    "community": "Post builds in-group identity or belonging",
    "outrage": "Post provokes anger or moral indignation",
    "nostalgia": "Post references shared memories or past",
    "empathy": "Post shows vulnerability or seeks support",
    "information": "Post provides useful facts or news",
}
David classifies whether a comment is relevant to the original post (boolean True/False). Simple but powerful for filtering noise. He also has a separate spam detection prompt for construction equipment comments.
David's cutthrough — OpenAI_Basic.ipynb
def get_prompt(post, comment):
    """Return the relevance-classification prompt for a post/comment pair.

    The prompt asks the model to answer with a boolean True or False and
    embeds the post and the comment under bold markdown headings.
    """
    return f'''You are an analyst analyzing comments on
social media. You will see both the outbound post from the
government and the comment back. You need to classify the
comment as relevant to the outbound post or not.
Please evaluate the relevance of the comment & return a
boolean True or False for the classification.
**Here is the post:**
{post}
**Here is the comment:**
{comment}
'''
David's spam detection — Apify_Comments__1_.ipynb
def get_response(model, tokens, post):
    """Build the spam / listing / competitor / sentiment prompt for a comment.

    The prompt requests four fields in JSON format: ``is_spam``,
    ``is_listing``, ``is_competitor_mention`` and ``sentiment``. The
    remainder of the function is elided in this excerpt (``...``).
    """
    prompt = f"""
Analyze the following comment about a construction loader
and provide the following information in JSON format:
1. "is_spam": Is this comment spam? Return True or False.
2. "is_listing": Is this a product listing trying to sell?
Return True or False.
3. "is_competitor_mention": Does it mention a competitor?
Return True or False.
4. "sentiment": Positive, Neutral, or Negative.
Comment: {post}
"""
    ...
spam_filter.py
def build_relevance_prompt(post_text: str,
                           comment_text: str) -> str:
    """Build the prompt that asks whether a comment relates to its post.

    David's pattern: boolean classification. Intended to run before NLP
    classification so that noise is filtered out and irrelevant comments
    stay out of the training data.

    Returns:
        Prompt text requesting JSON with ``is_relevant``, ``is_spam`` and
        a one-sentence ``reason``.
    """
    prompt = f"""You are a social media analyst. Determine if
this comment is relevant to the original post.
Return ONLY valid JSON:
{{"is_relevant": true/false, "is_spam": true/false,
"reason": "one sentence"}}
Original post: {post_text}
Comment: {comment_text}"""
    return prompt
def filter_spam_comments(df: pd.DataFrame) -> pd.DataFrame:
    """Pre-filter comments before NLP classification.

    Removes:
    - Spam (promotional links, bot comments)
    - Irrelevant comments (not about the post)
    - Non-English comments (unless page is multilingual)
    Run this BEFORE classify_batch() to save API costs.

    Returns the rows where ``is_relevant`` is True and ``is_spam`` is False,
    and prints how many rows were removed.

    NOTE(review): nothing in this body checks language, so the non-English
    bullet above is not implemented here.
    """
    # NOTE(review): this one-line system prompt does not request the
    # is_relevant / is_spam JSON that build_relevance_prompt() describes, and
    # the original post is never passed along — confirm the intended prompt.
    system_prompt = "You filter social media comments for relevance."
    results = classify_batch(df, system_prompt,
                             text_col="comment_text",
                             max_workers=8)
    # NOTE(review): classify_batch() as shown in this guide only writes the
    # sentiment / emotion / intent / topic / toxicity / confidence /
    # reasoning columns, so these lookups raise KeyError unless df already
    # carries is_relevant and is_spam — confirm.
    df["is_relevant"] = results["is_relevant"]
    df["is_spam"] = results["is_spam"]
    # Element-wise comparison on the columns: keep relevant, non-spam rows.
    clean = df[(df["is_relevant"] == True) &
               (df["is_spam"] == False)]
    print(f"Filtered: {len(df)} -> {len(clean)} "
          f"({len(df)-len(clean)} removed)")
    return clean
David builds audience segments using K-Modes clustering on survey data, then trains an XGBoost classifier so new respondents can be assigned to segments automatically. This is the same concept as our model training: cluster first to find patterns, then build a classifier to apply those patterns at scale.
David's K-Modes clustering — BHA_Segmentation.ipynb
from kmodes.kmodes import KModes
import matplotlib.pyplot as plt

# Step 1: Find optimal number of clusters (elbow method)
# Fit K-Modes for k = 1..19 and record each fit's cost_ for the plot.
cost = []
K = range(1, 20)
for k in list(K):
    kmode = KModes(n_clusters=k, init="random", n_init=5)
    # Missing answers are filled with the literal category 'Other'.
    kmode.fit_predict(df_label[variables].fillna('Other'))
    cost.append(kmode.cost_)
plt.plot(K, cost, 'x-')
plt.xlabel('No. of clusters')
plt.title('Elbow Curve')
plt.show()
# Step 2: Cluster with optimal k
kmode = KModes(n_clusters=14, init="random", n_init=5)
clusters = kmode.fit_predict(df_label[variables].fillna('Other'))
# NOTE(review): clusters are computed from df_label but stored on df —
# presumably both frames share the same row order; confirm.
df['Cluster'] = clusters
David's XGBoost classifier — BHA_Segmentation.ipynb
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Encode the segment names in 'New seg' as integer class ids.
le = LabelEncoder()
y = le.fit_transform(df['New seg'])
# Features = subset of survey questions ("golden questions")
# Every remaining column is a feature; missing answers are filled with -1.
X = df.drop(columns='New seg').fillna(-1)
# Train XGBoost to predict segment from golden questions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
model.fit(X_train, y_train)
# Check accuracy
# NOTE(review): predictions are made on the full X (training rows included)
# and X_test / y_test are never used, so the printed figure is not a
# held-out accuracy.
df_res = pd.DataFrame(model.predict(X))
df_res['y'] = y
df_res['accurate'] = (df_res[0] == df_res['y']).astype(int)
print(df_res['accurate'].mean())  # accuracy score
Conceptual parallel — how this maps to our pipeline
# David's approach:
# 1. Cluster survey data to find segments (unsupervised)
# 2. Train XGBoost on golden questions to PREDICT segments
# 3. New survey respondents get auto-assigned to a segment
#
# Our approach (same concept, different domain):
# 1. Claude + DeepSeek label 18K posts (= creating "segments")
# 2. Train 6 NLP models on labelled data to PREDICT labels
# 3. New posts get auto-labelled by the trained models
#
# The principle is identical:
# - First: create ground truth (clustering / LLM labelling)
# - Then: train a cheap fast model to replicate it at scale
# - Finally: deploy for automatic classification of new data
# David's golden question idea applies to us too:
# Not all features matter equally. XGBoost feature importance
# shows which survey questions predict segments best.
# Similarly, our weighted loss shows which training examples
# matter most (Claude labels > DeepSeek > synthetic).
ArtemisAI · Implementation Guide · April 2026 · Asad