cs2370 Notes: 33 AI Demos

(Thanks to Grok for some suggestions. Let’s see if any of its sample code works.)

More text generation (distilgpt2):

from transformers import pipeline
import torch

generator = pipeline("text-generation", 
                     model="distilgpt2", device=0)

# Generate text (max_new_tokens counts only the newly generated tokens;
# the older max_length counted the prompt too, so completions ran short)
prompt = "In a futuristic city, AI robots and humans coexist. The story begins when..."
output = generator(prompt, max_new_tokens=80, num_return_sequences=1)
print(output[0]["generated_text"])
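
The pipeline forwards generation kwargs straight to model.generate(), so it's easy to play with the sampling knobs. A quick variant (these particular values are just starting points to fiddle with):

outputs = generator(
    prompt,
    max_new_tokens=60,       # cap on newly generated tokens
    do_sample=True,          # sample instead of greedy decoding
    temperature=0.9,         # <1 = safer, >1 = wilder
    top_p=0.95,              # nucleus sampling: drop the low-probability tail
    num_return_sequences=2,  # two independent completions
)
for i, out in enumerate(outputs):
    print(f"--- completion {i} ---")
    print(out["generated_text"])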

Image classification:

from transformers import pipeline
from PIL import Image
import requests
from io import BytesIO

# ViT-Base classifier, pretrained on ImageNet-1k
classifier = pipeline("image-classification", model="google/vit-base-patch16-224", device=0)

# Fetch a test image over HTTP
url = "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png"
response = requests.get(url)
image = Image.open(BytesIO(response.content))

results = classifier(image)
for result in results[:3]:
    print(f"Label: {result['label']}, Score: {result['score']:.4f}")

Image generation:

from diffusers import AutoPipelineForText2Image
import torch

pipeline = AutoPipelineForText2Image.from_pretrained(
    "runwayml/stable-diffusion-v1-5",  # if this repo is gone from the Hub, try the mirror "stable-diffusion-v1-5/stable-diffusion-v1-5"
    torch_dtype=torch.float16,  # half precision keeps VRAM use modest
    use_safetensors=True
).to("cuda")

# Generate image
prompt = "A futuristic cityscape at sunset, cyberpunk style"
image = pipeline(prompt, num_inference_steps=30, guidance_scale=7.5).images[0]

# Save image, then display it (a bare expression on the last line renders in Jupyter)
image.save("cityscape.png")
image
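
SD 1.5 at fp16 should fit in 12 GB, but if you hit out-of-memory errors (bigger resolutions, other things on the GPU), diffusers has built-in memory savers; attention slicing is the cheapest to try:

pipeline.enable_attention_slicing()  # compute attention in slices: a bit slower, much lighter on VRAM
image = pipeline(prompt, num_inference_steps=30, guidance_scale=7.5).images[0]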

Text to speech:

# Import libraries
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import torch
import soundfile as sf
from IPython.display import Audio

# Load model and processor
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to("cuda")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to("cuda")

# Input text
text = "Hello! This is a text-to-speech demo using SpeechT5 on an NVIDIA RTX 3060."

# Process text
inputs = processor(text=text, return_tensors="pt").to("cuda")

# Generate speech
speaker_embeddings = torch.zeros(1, 512).to("cuda")  # placeholder x-vector; works, but sounds flat (see below)
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

# Save and play audio
sf.write("speecht5_output.wav", speech.cpu().numpy(), samplerate=16000)
Audio("speecht5_output.wav")

Another one:

# Import libraries
from transformers import AutoProcessor, BarkModel
import torch
import soundfile as sf
from IPython.display import Audio

# Load model and processor
processor = AutoProcessor.from_pretrained("suno/bark-small")
model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16).to("cuda")

# Input text
text = "In a galaxy far, far away, AI models create stunning audio experiences."

# Process text
inputs = processor(text, voice_preset="v2/en_speaker_6", return_tensors="pt").to("cuda")

# Generate speech; Bark returns shape (1, samples), so squeeze to 1-D,
# and upcast from fp16 since soundfile can't write float16
with torch.no_grad():
    audio = model.generate(**inputs).cpu().float().numpy().squeeze()

# Save and play audio
sf.write("bark_output.wav", audio, samplerate=model.generation_config.sample_rate)
Audio("bark_output.wav")

Song generation?

# Install dependencies
# $ pip install audiocraft torch soundfile numpy

# Import libraries
from audiocraft.models import MusicGen
import torch
import soundfile as sf
from IPython.display import Audio

# Load MusicGen small model on the GPU (audiocraft manages precision internally;
# hand-casting model.lm to fp16 tends to cause dtype mismatches)
model = MusicGen.get_pretrained('facebook/musicgen-small', device='cuda')
model.set_generation_params(duration=10)  # generate a 10-second clip

# Input text prompt
prompt = "A catchy pop instrumental with upbeat drums and electric guitar"

# Generate music; the result is a (batch, channels, samples) tensor
wav = model.generate([prompt], progress=True)

# Save and play audio (musicgen-small is mono, so take the single channel)
sf.write("musicgen_output.wav", wav[0, 0].cpu().numpy(), model.sample_rate)
Audio("musicgen_output.wav")

Animation:

# Install dependencies
# $ pip install diffusers transformers torch imageio pillow accelerate

# Import libraries
from diffusers import AnimateDiffPipeline, DDIMScheduler, MotionAdapter
from diffusers.utils import export_to_gif
import torch
from IPython.display import Image as IPythonImage

# Load motion adapter and Stable Diffusion model
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)
pipe = AnimateDiffPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    motion_adapter=adapter,
    torch_dtype=torch.float16,
    use_safetensors=True
)

# AnimateDiff wants a linear-beta DDIM scheduler (per the diffusers docs)
pipe.scheduler = DDIMScheduler.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="scheduler",
    clip_sample=False, timestep_spacing="linspace", beta_schedule="linear", steps_offset=1
)

pipe.enable_model_cpu_offload()  # streams submodules to the GPU as needed (good for an RTX 3060); don't also call .to("cuda")

# Input text prompt
prompt = "A cartoon robot dancing in a futuristic city, vibrant colors"

# Generate video frames
output = pipe(
    prompt,
    num_frames=16,  # Short clip for GIF
    guidance_scale=7.5,
    num_inference_steps=25
)
frames = output.frames[0]

# Export to animated GIF
export_to_gif(frames, "animated_robot.gif")

# Display GIF in Jupyter
IPythonImage(filename="animated_robot.gif")
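
GIFs get large fast; diffusers can write the same frames to MP4 instead (export_to_video requires opencv-python):

from diffusers.utils import export_to_video

export_to_video(frames, "animated_robot.mp4", fps=8)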