
Welcome to Shadow Shift, the future of AI-powered data solutions.
Shadow Shift transforms how you work with data. From automating dataset creation to generating privacy-safe synthetic data, our platform accelerates AI development while ensuring compliance and security.
Boost your productivity with our curated library of production-ready scripts
Generates large text datasets of any type from a natural-language prompt.
import torch
import torch.nn as nn

# ---------------------------
# Upgraded Transformer Model
# ---------------------------
class PromptToTextDatasetGeneratorModel(nn.Module):
    def __init__(self,
                 vocab_size,
                 d_model=256,
                 num_encoder_layers=4,
                 num_decoder_layers=4,
                 nhead=8,
                 dim_feedforward=1024,
                 dropout=0.1,
                 max_seq_length=512):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.max_seq_length = max_seq_length
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_embedding = nn.Embedding(max_seq_length, d_model)
        self.transformer = nn.Transformer(d_model=d_model,
                                          nhead=nhead,
                                          num_encoder_layers=num_encoder_layers,
                                          num_decoder_layers=num_decoder_layers,
                                          dim_feedforward=dim_feedforward,
                                          dropout=dropout)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt):
        # src, tgt: (seq_len, batch_size) tensors of token indices
        seq_len_src, batch_size = src.size()
        seq_len_tgt, _ = tgt.size()
        src_pos = torch.arange(0, seq_len_src, device=src.device).unsqueeze(1).expand(seq_len_src, batch_size)
        tgt_pos = torch.arange(0, seq_len_tgt, device=tgt.device).unsqueeze(1).expand(seq_len_tgt, batch_size)
        src_emb = self.embedding(src) * (self.d_model ** 0.5) + self.pos_embedding(src_pos)
        tgt_emb = self.embedding(tgt) * (self.d_model ** 0.5) + self.pos_embedding(tgt_pos)
        output = self.transformer(src_emb, tgt_emb)
        return self.fc_out(output)

# ---------------------------
# Dummy Tokenizer with Domain & Format Awareness
# ---------------------------
class DummyTokenizer:
    def __init__(self, vocab_size=10000):
        self.vocab_size = vocab_size

    def encode(self, text, max_length=50):
        torch.manual_seed(len(text))  # For reproducibility
        tokens = torch.randint(10, self.vocab_size, (max_length,))
        return tokens

# ---------------------------
# Dummy Test - with prompt parsing
# ---------------------------
def dummy_test():
    vocab_size = 10000
    model = PromptToTextDatasetGeneratorModel(vocab_size=vocab_size)
    tokenizer = DummyTokenizer(vocab_size)

    # 🧪 Prompt including domain and format
    prompt = "Generate a 100K-row medical dataset in CSV format with age, gender, bp, cholesterol."
    response_format = "CSV"
    domain = "Medical"
    # Here you can later add a "prompt parser" module to extract rows, domain, format etc.
    print("🧪 Parsed Format:", response_format)
    print("🧠 Parsed Domain:", domain)

    src_tokens = tokenizer.encode(prompt, max_length=50)
    tgt_tokens = tokenizer.encode("age,gender,bp,...", max_length=60)
    src = src_tokens.unsqueeze(1)  # (src_seq_len, batch_size)
    tgt = tgt_tokens.unsqueeze(1)  # (tgt_seq_len, batch_size)

    logits = model(src, tgt)
    print("✅ Logits shape:", logits.shape)  # Expected: (tgt_seq_len, batch_size, vocab_size)

dummy_test()
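The dummy test above only exercises a single forward pass. As a rough sketch of how the model would actually be fitted, the loop below reuses the PromptToTextDatasetGeneratorModel class defined above and feeds random token batches in place of real tokenized prompt/dataset pairs; it is an illustration under those assumptions, not part of the production script.

# Minimal training-loop sketch (assumption: reuses PromptToTextDatasetGeneratorModel
# from above; random token batches stand in for real tokenized prompt/dataset pairs).
import torch
import torch.nn as nn

vocab_size = 10000
model = PromptToTextDatasetGeneratorModel(vocab_size=vocab_size)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss()

for step in range(3):  # a few dummy steps just to show the shapes
    src = torch.randint(10, vocab_size, (50, 8))   # (src_seq_len, batch_size)
    tgt = torch.randint(10, vocab_size, (60, 8))   # (tgt_seq_len, batch_size)
    logits = model(src, tgt[:-1])                  # teacher forcing: predict tgt shifted by one
    loss = criterion(logits.reshape(-1, vocab_size), tgt[1:].reshape(-1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(f"step {step}: loss {loss.item():.3f}")

A real training run would additionally pass a causal target mask (for example from nn.Transformer.generate_square_subsequent_mask) so the decoder cannot peek at future tokens.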
Scrapes the web for face images.
# ================== SETUP ==================
!pip install aiohttp aiofiles duckduckgo-search beautifulsoup4 tqdm imagehash opencv-python-headless pyppeteer nest_asyncio

# ================== IMPORTS ==================
import os, cv2, aiohttp, asyncio, aiofiles, random, imagehash, zipfile
from PIL import Image
from tqdm import tqdm
from bs4 import BeautifulSoup
from duckduckgo_search import ddg
from pyppeteer import launch
import nest_asyncio
from IPython.display import FileLink, display

nest_asyncio.apply()

# ================== CONFIG ==================
KEYWORDS = ["celebrity face", "portrait photo", "actor face", "famous people closeup"]
MAX_RESULTS_PER_KEYWORD = 500
TARGET_IMAGE_COUNT = 100_000
DOWNLOAD_FOLDER = "ScrapedImages"
ZIP_FILENAME = "FaceDataset.zip"
CONCURRENT_TASKS = 100
PROXIES = []  # Add your proxy API pool here
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    "Mozilla/5.0 (X11; Linux x86_64)",
]

os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)
image_hashes = set()
image_count = 0
semaphore = asyncio.Semaphore(CONCURRENT_TASKS)

# ================== UTILS ==================
def get_headers():
    return {"User-Agent": random.choice(USER_AGENTS)}

def get_proxy():
    return random.choice(PROXIES) if PROXIES else None

def contains_face(image_path):
    # Quick face check with OpenCV's frontal-face Haar cascade
    try:
        img = cv2.imread(image_path)
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
        faces = face_cascade.detectMultiScale(gray, 1.1, 4)
        return len(faces) > 0
    except:
        return False

def is_duplicate(img_path):
    # Perceptual-hash deduplication
    try:
        img = Image.open(img_path).convert("RGB")
        h = str(imagehash.phash(img))
        if h in image_hashes:
            return True
        image_hashes.add(h)
        return False
    except:
        return True

async def fetch_html_puppeteer(url):
    try:
        browser = await launch(headless=True, args=["--no-sandbox"])
        page = await browser.newPage()
        await page.setUserAgent(random.choice(USER_AGENTS))
        await page.goto(url, timeout=30000)
        content = await page.content()
        await browser.close()
        return content
    except:
        return None

def extract_image_links(html, base_url):
    try:
        soup = BeautifulSoup(html, "html.parser")
        images = []
        for img in soup.find_all("img"):
            src = img.get("src")
            if src and src.startswith("http"):
                images.append(src)
            elif src and src.startswith("/"):
                images.append(base_url + src)
        return images
    except:
        return []

async def download_and_filter_image(session, url, index, sub_index):
    global image_count
    filename = os.path.join(DOWNLOAD_FOLDER, f"{index}_{sub_index}.jpg")
    try:
        async with semaphore:
            async with session.get(url, headers=get_headers(), proxy=get_proxy(), timeout=10) as resp:
                if resp.status == 200:
                    f = await aiofiles.open(filename, "wb")
                    await f.write(await resp.read())
                    await f.close()
                    if contains_face(filename) and not is_duplicate(filename):
                        image_count += 1
                    else:
                        os.remove(filename)
    except:
        if os.path.exists(filename):
            os.remove(filename)

async def scrape_and_download(session, url, index):
    if image_count >= TARGET_IMAGE_COUNT:
        return
    html = await fetch_html_puppeteer(url)
    if not html:
        return
    base_url = "/".join(url.split("/")[:3])
    image_links = extract_image_links(html, base_url)
    tasks = [
        download_and_filter_image(session, img_url, index, i)
        for i, img_url in enumerate(image_links)
        if image_count < TARGET_IMAGE_COUNT
    ]
    await asyncio.gather(*tasks)

async def main():
    global image_count
    print("[*] Collecting DuckDuckGo URLs...")
    results = []
    for keyword in KEYWORDS:
        results += ddg(keyword, max_results=MAX_RESULTS_PER_KEYWORD)
    urls = [res["href"] for res in results]
    print(f"[*] Scraping from {len(urls)} pages...")
    async with aiohttp.ClientSession() as session:
        for idx, url in enumerate(tqdm(urls)):
            if image_count >= TARGET_IMAGE_COUNT:
                break
            await scrape_and_download(session, url, idx)
    print(f"\n✅ Downloaded {image_count} face images.")
    print("[*] Zipping dataset...")
    with zipfile.ZipFile(ZIP_FILENAME, 'w') as zipf:
        for root, _, files in os.walk(DOWNLOAD_FOLDER):
            for file in files:
                zipf.write(os.path.join(root, file))
    print("✅ Dataset ready.")
    display(FileLink(ZIP_FILENAME))

await main()
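Note: the trailing await main() relies on notebook-style top-level await (nest_asyncio is applied above for that reason); in a standalone script, replace it with asyncio.run(main()).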
Creates realistic patient datasets.
import random
import pandas as pd
from datetime import datetime, timedelta

# ================== CONFIG ==================
NUM_ROWS = 10_000
GENDERS = ["Male", "Female", "Other"]

def generate_random_date():
    start_date = datetime.strptime('2000-01-01', '%Y-%m-%d')
    end_date = datetime.today()
    delta = end_date - start_date
    random_days = random.randint(0, delta.days)
    return (start_date + timedelta(days=random_days)).strftime('%Y-%m-%d')

def generate_patient_data(index):
    age = random.randint(18, 90)
    gender = random.choice(GENDERS)
    # Blood pressure: systolic/diastolic based on age
    if age < 40:
        systolic = random.randint(100, 130)
        diastolic = random.randint(60, 85)
    elif age < 60:
        systolic = random.randint(110, 145)
        diastolic = random.randint(70, 90)
    else:
        systolic = random.randint(120, 160)
        diastolic = random.randint(75, 95)
    # Cholesterol (mg/dL)
    cholesterol = random.randint(150, 280)
    return {
        "PatientID": f"P{100000 + index}",
        "Age": age,
        "Gender": gender,
        "SystolicBP": systolic,
        "DiastolicBP": diastolic,
        "Cholesterol": cholesterol,
        "CheckupDate": generate_random_date()
    }

# ================== GENERATE ==================
data = [generate_patient_data(i) for i in range(NUM_ROWS)]
df = pd.DataFrame(data)

# ================== SAVE ==================
df.to_csv("patient_data.csv", index=False)
print("✅ Dataset created as 'patient_data.csv'")
df.head()
Synthetic dataset generator: create realistic, private synthetic datasets.
On-demand, customizable synthetic data for AI training with privacy guarantees
Generate synthetic patient data for medical AI without privacy risks
Create realistic transaction data for fraud detection systems
Simulate customer behavior for demand forecasting and inventory optimization
Generate synthetic sensor data for predictive maintenance models
Simulate rare driving scenarios for safer AI training
Tailored synthetic data for your specific industry and use case
Create perfect datasets in just 3 simple steps
Tell us what data you need through our simple interface or API
Our system automatically collects, generates, cleans, and structures your data
Get your dataset in multiple formats, ready for analysis
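As a concrete illustration of the first step, a request through the API might look roughly like the sketch below. The endpoint, payload fields, and response shape are hypothetical placeholders for illustration only, not the documented Shadow Shift API.

# Hypothetical request sketch: the URL, payload fields, and response shape are
# illustrative placeholders, not the documented Shadow Shift API.
import requests

payload = {
    "domain": "medical",                                # assumed field
    "format": "csv",                                    # assumed field
    "rows": 100_000,                                    # assumed field
    "columns": ["age", "gender", "bp", "cholesterol"],  # assumed field
}
resp = requests.post(
    "https://api.example.com/v1/datasets",              # placeholder endpoint
    json=payload,
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    timeout=30,
)
resp.raise_for_status()
print(resp.json())  # e.g. a job id or a download link, depending on the real API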
Generate real or synthetic datasets in minutes, not days. Our optimized pipelines deliver results at lightning speed.
Our synthetic data platform eliminates privacy concerns while maintaining data utility for AI training.
From scripts to synthetic data generation, we provide production-ready solutions for your projects.
Custom solutions for any industry with domain-specific data generation.
Join thousands of professionals who rely on Shadow Shift
Data Scientist, TechCorp
"Shadow Shift's synthetic data platform allowed us to train our medical AI models without compromising patient privacy. The data quality was indistinguishable from real data for our use case."
ML Engineer, StartupX
"The combination of real data automation and synthetic data generation has cut our development cycles in half. We can now test scenarios we could never collect real data for."
Choose a plan that fits your dataset-building pace.
Get answers to common questions about Shadow Shift
Join thousands of data professionals who save hours every week with Shadow Shift's data automation and synthetic data solutions.