In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

data = pd.read_csv('mbti_1.csv')

# Encode MBTI types
le = LabelEncoder()
data['type_code'] = le.fit_transform(data['type'])

# Train/test split for text posts
X_text = data['posts']
y = data['type_code']
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42)

# TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=3000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test_tfidf = vectorizer.transform(X_test_text)


In [6]:
import numpy as np

num_train = X_train_tfidf.shape[0]
num_test = X_test_tfidf.shape[0]
num_questions = 5

# Simulate 0/1 answers for train and test
np.random.seed(42)
X_train_q = np.random.randint(0, 2, size=(num_train, num_questions))
X_test_q = np.random.randint(0, 2, size=(num_test, num_questions))


In [7]:
from scipy.sparse import hstack

X_train_combined = hstack([X_train_tfidf, X_train_q])
X_test_combined = hstack([X_test_tfidf, X_test_q])


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_combined, y_train)

y_pred = model.predict(X_test_combined)

print("Accuracy on test set:", accuracy_score(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred, target_names=le.classes_))


Accuracy on test set: 0.5688760806916426
Classification report:
               precision    recall  f1-score   support

        ENFJ       0.00      0.00      0.00        41
        ENFP       0.77      0.39      0.52       125
        ENTJ       0.38      0.07      0.12        44
        ENTP       0.71      0.41      0.52       135
        ESFJ       0.00      0.00      0.00         7
        ESFP       0.00      0.00      0.00         8
        ESTJ       0.00      0.00      0.00         7
        ESTP       0.00      0.00      0.00        15
        INFJ       0.54      0.68      0.60       288
        INFP       0.50      0.88      0.63       370
        INTJ       0.57      0.59      0.58       193
        INTP       0.65      0.74      0.70       293
        ISFJ       1.00      0.02      0.04        45
        ISFP       1.00      0.08      0.14        53
        ISTJ       0.67      0.05      0.09        44
        ISTP       0.76      0.33      0.46        67

    accuracy   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
import joblib

joblib.dump(model, "hybrid_personality_model.joblib")
joblib.dump(vectorizer, "tfidf_vectorizer.joblib")
joblib.dump(le, "label_encoder.joblib")


['label_encoder.joblib']

In [11]:
!pip install sentence-transformers chromadb joblib numpy


Collecting chromadb
  Downloading chromadb-1.0.20-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.3 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.36.0-py3-none-any.whl.metadata (2.4 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [

In [18]:
import numpy as np
from scipy.sparse import hstack
import joblib
import chromadb
import json

# Load pretrained model and vectorizer
model = joblib.load("hybrid_personality_model.joblib")
vectorizer = joblib.load("tfidf_vectorizer.joblib")
le = joblib.load("label_encoder.joblib")

# Load MBTI descriptions (JSON file with descriptions keyed by MBTI types)
with open("personality_descriptions.json", "r") as f:
    personality_descriptions = json.load(f)

# Define the MBTI questionnaire
questions = [
    ("At social events, I usually:", "Meet and talk with many new people", "Stick with a small group of close friends"),
    ("When focusing on information, I prefer:", "Concrete facts and practical details", "Abstract ideas and imaginative concepts"),
    ("When making decisions, I rely on:", "Logic and objective analysis", "Feelings and harmony"),
    ("My work style tends to be:", "Organized and planned", "Flexible and spontaneous"),
    ("I value:", "Fairness and impartiality", "Harmony and kindness"),
]

print("Please enter a brief description about yourself:")
user_text = input("> ").strip()

answers = []
for idx, (q, a, b) in enumerate(questions):
    print(f"\nQ{idx+1}: {q}")
    print(f"  1. {a}")
    print(f"  2. {b}")
    while True:
        inp = input("Choose 1 or 2: ").strip()
        if inp in ("1", "2"):
            answers.append(int(inp) - 1)
            break
        else:
            print("Invalid choice, please enter 1 or 2.")

# Vectorize the user text using the original TF-IDF vectorizer
text_vec = vectorizer.transform([user_text])

# Convert answers to numpy array and reshape
answer_vec = np.array(answers).reshape(1, -1)

# Combine text features (sparse) + answer features (dense)
hybrid_vec = hstack([text_vec, answer_vec])

# Predict MBTI type using loaded model
pred_code = model.predict(hybrid_vec)[0]
pred_type = le.inverse_transform([pred_code])[0]
description = personality_descriptions.get(pred_type, "Description not available.")

print(f"\nYour MBTI personality type is: {pred_type}")
print(description)

# Store profile in ChromaDB vector database
client = chromadb.Client()
collection = client.get_or_create_collection(name='personality_profiles')

metadata = {
    "mbti_type": pred_type,
    "answers": json.dumps(answers),
    "user_text": user_text
}

import uuid

unique_id = str(uuid.uuid4())  # generate unique ID for this record
collection.add(
    embeddings=hybrid_vec.toarray().tolist(),
    metadatas=[metadata],
    ids=[unique_id]
)


print("\nYour profile has been saved to the personality database.")


Please enter a brief description about yourself:
> I am a sad person 

Q1: At social events, I usually:
  1. Meet and talk with many new people
  2. Stick with a small group of close friends
Choose 1 or 2: 2

Q2: When focusing on information, I prefer:
  1. Concrete facts and practical details
  2. Abstract ideas and imaginative concepts
Choose 1 or 2: 1

Q3: When making decisions, I rely on:
  1. Logic and objective analysis
  2. Feelings and harmony
Choose 1 or 2: 2

Q4: My work style tends to be:
  1. Organized and planned
  2. Flexible and spontaneous
Choose 1 or 2: 1

Q5: I value:
  1. Fairness and impartiality
  2. Harmony and kindness
Choose 1 or 2: 2

Your MBTI personality type is: INFP
INFP - The Mediator (Introverted, Intuitive, Feeling, Perceiving)

Overview:
INFPs are idealistic, creative, and deeply caring individuals who value authenticity and meaningful connections.

Key Traits:
- Reflective and reserved
- Imaginative and open-minded
- Empathetic and sensitive
- Value-dr

In [19]:
client = chromadb.Client()
collection = client.get_collection(name='personality_profiles')

In [20]:
results = collection.get()

print("Stored profile IDs:", results['ids'])
print("Stored metadata example:", results['metadatas'])


Stored profile IDs: ['ff6ea2d8-0b78-47ea-b125-0d9baec116a2', '3665925b-1b07-489b-9108-7f4ad3914618']
Stored metadata example: [{'user_text': 'I am a calm person and an extrovert. I love to to explore things', 'mbti_type': 'INFP', 'answers': '[0, 1, 0, 1, 1]'}, {'mbti_type': 'INFP', 'answers': '[1, 0, 1, 0, 1]', 'user_text': 'I am a sad person'}]
