diff --git a/backend/src/services/studio.js b/backend/src/services/studio.js index fbc197c..c15e356 100644 --- a/backend/src/services/studio.js +++ b/backend/src/services/studio.js @@ -10,6 +10,7 @@ const MasteringSessionsDBApi = require('../db/api/mastering_sessions'); const ExportsDBApi = require('../db/api/exports'); const SongMetadataDBApi = require('../db/api/song_metadata'); const CoverArtworksDBApi = require('../db/api/cover_artworks'); +const AssetsDBApi = require('../db/api/assets'); const { Op } = db.Sequelize; @@ -98,13 +99,42 @@ function buildDescription({ genreName, languageName, promptText, vocalMode, targ .join('. '); } +function sanitizeUploadedAudioFile(rawFile) { + if (!rawFile || typeof rawFile !== 'object') { + return null; + } + + const name = `${rawFile.name || ''}`.trim(); + const privateUrl = `${rawFile.privateUrl || ''}`.trim(); + const publicUrl = `${rawFile.publicUrl || ''}`.trim(); + + if (!name || !privateUrl || !publicUrl || !rawFile.new) { + return null; + } + + if (!/\.(mp3|wav)$/i.test(name) && !/\.(mp3|wav)$/i.test(privateUrl)) { + throw badRequest('Only MP3 and WAV vocal uploads are supported right now.'); + } + + const sizeInBytes = Number(rawFile.sizeInBytes); + + return { + id: rawFile.id || undefined, + name, + sizeInBytes: Number.isFinite(sizeInBytes) && sizeInBytes > 0 ? sizeInBytes : null, + privateUrl, + publicUrl, + new: true, + }; +} + function generateIsrc() { const year = new Date().getFullYear().toString().slice(-2); const random = Math.random().toString().slice(2, 7); return `ZA-AIM-${year}-${random}`; } -function mapSessionSummary(project, song, generationRequest, mixSession, masteringSession, exportJob, recordingSession, songMetadata, coverArtwork) { +function mapSessionSummary(project, song, generationRequest, mixSession, masteringSession, exportJob, recordingSession, songMetadata, coverArtwork, vocalAsset, vocalUpload) { return { project: { id: project.id, @@ -133,6 +163,15 @@ function mapSessionSummary(project, song, generationRequest, mixSession, masteri href: `/recording_sessions/${recordingSession.id}`, } : null, + vocalAsset: vocalAsset + ? { + id: vocalAsset.id, + name: vocalAsset.name, + fileName: vocalUpload?.name || vocalAsset.name, + publicUrl: vocalUpload?.publicUrl || null, + href: `/assets/${vocalAsset.id}`, + } + : null, mixSession: { id: mixSession.id, status: mixSession.status, @@ -369,6 +408,12 @@ module.exports = class StudioService { throw badRequest('Target BPM must be between 60 and 180.'); } + const uploadedVocal = sanitizeUploadedAudioFile(data.vocalUpload); + + if (data.vocalMode === 'upload' && !uploadedVocal) { + throw badRequest('Upload an MP3 or WAV vocal take before launching the session.'); + } + const scopedWhere = organizationId ? { organizationsId: organizationId } : {}; const [genre, language, selectedPreset, musicModel] = await Promise.all([ db.genres.findOne({ where: { id: data.genreId, ...scopedWhere } }), @@ -444,12 +489,30 @@ module.exports = class StudioService { { currentUser, transaction }, ); + const vocalAsset = uploadedVocal + ? await AssetsDBApi.create( + { + asset_type: 'audio', + audio_role: 'vocal_raw', + name: uploadedVocal.name, + uploaded_user: currentUser.id, + project: project.id, + song: song.id, + is_stereo: false, + organizations: organizationId, + file_blobs: [uploadedVocal], + }, + { currentUser, transaction }, + ) + : null; + const generationRequest = await GenerationRequestsDBApi.create( { project: project.id, song: song.id, requested_user: currentUser.id, model: musicModel?.id || null, + input_asset: vocalAsset?.id || null, request_type: data.vocalMode === 'upload' ? 'generate_beat_from_vocals' : 'generate_beat_from_text', prompt_text: promptText, target_genre: genre.id, @@ -490,6 +553,7 @@ module.exports = class StudioService { instrument: track.instrument, volume_db: track.volume_db, pan: index % 2 === 0 ? -5 : 5, + source_asset: track.track_type === 'vocal' && vocalAsset ? vocalAsset.id : null, organizations: organizationId, }, { currentUser, transaction }, @@ -597,6 +661,8 @@ module.exports = class StudioService { recordingSession, songMetadata, coverArtwork, + vocalAsset, + uploadedVocal, ), arrangementSections: arrangementSections.map((section) => ({ id: section.id, diff --git a/frontend/src/components/Studio/AudioWaveformPreview.tsx b/frontend/src/components/Studio/AudioWaveformPreview.tsx new file mode 100644 index 0000000..ece1053 --- /dev/null +++ b/frontend/src/components/Studio/AudioWaveformPreview.tsx @@ -0,0 +1,139 @@ +import React, { useEffect, useMemo, useState } from 'react'; + +type Props = { + file?: File | null; + audioUrl?: string; + title?: string; + subtitle?: string; + emptyMessage?: string; + isLoading?: boolean; +}; + +const BAR_COUNT = 56; + +function createWaveformBars(channelData: Float32Array) { + const blockSize = Math.max(1, Math.floor(channelData.length / BAR_COUNT)); + const bars: number[] = []; + + for (let index = 0; index < BAR_COUNT; index += 1) { + const start = index * blockSize; + const end = Math.min(channelData.length, start + blockSize); + let sum = 0; + + for (let sampleIndex = start; sampleIndex < end; sampleIndex += 1) { + sum += Math.abs(channelData[sampleIndex]); + } + + const average = end > start ? sum / (end - start) : 0; + bars.push(Math.min(100, Math.max(8, Math.round(average * 280)))); + } + + return bars; +} + +const AudioWaveformPreview = ({ + file, + audioUrl, + title = 'Waveform preview', + subtitle, + emptyMessage = 'Add an audio file to preview its waveform.', + isLoading = false, +}: Props) => { + const [bars, setBars] = useState([]); + const [errorMessage, setErrorMessage] = useState(''); + const previewUrl = useMemo(() => (file ? URL.createObjectURL(file) : audioUrl || ''), [audioUrl, file]); + + useEffect(() => { + return () => { + if (file && previewUrl) { + URL.revokeObjectURL(previewUrl); + } + }; + }, [file, previewUrl]); + + useEffect(() => { + let isActive = true; + let audioContext: AudioContext | null = null; + + const buildWaveform = async () => { + if (!file && !audioUrl) { + setBars([]); + setErrorMessage(''); + return; + } + + try { + setErrorMessage(''); + + const audioBuffer = file ? await file.arrayBuffer() : await fetch(audioUrl as string).then((response) => response.arrayBuffer()); + + if (typeof window === 'undefined' || !window.AudioContext) { + throw new Error('AudioContext is unavailable in this browser.'); + } + + audioContext = new window.AudioContext(); + const decoded = await audioContext.decodeAudioData(audioBuffer.slice(0)); + + if (!isActive) { + return; + } + + setBars(createWaveformBars(decoded.getChannelData(0))); + } catch (error) { + console.error('Failed to render waveform preview:', error); + + if (isActive) { + setBars([]); + setErrorMessage('Waveform preview is unavailable for this file, but the audio upload is still attached.'); + } + } finally { + if (audioContext && audioContext.state !== 'closed') { + audioContext.close().catch(() => null); + } + } + }; + + buildWaveform(); + + return () => { + isActive = false; + + if (audioContext && audioContext.state !== 'closed') { + audioContext.close().catch(() => null); + } + }; + }, [audioUrl, file]); + + return ( +
+
+
+
{title}
+
{subtitle || file?.name || 'Awaiting audio upload'}
+
+
+ {isLoading ? 'Rendering…' : previewUrl ? 'Ready to review' : 'No audio'} +
+
+ +
+ {bars.length ? ( + bars.map((barHeight, index) => ( +
+ )) + ) : ( +
{emptyMessage}
+ )} +
+ + {previewUrl ?
+ ); +}; + +export default AudioWaveformPreview; diff --git a/frontend/src/pages/studio.tsx b/frontend/src/pages/studio.tsx index ac7b8b1..f20818c 100644 --- a/frontend/src/pages/studio.tsx +++ b/frontend/src/pages/studio.tsx @@ -1,9 +1,11 @@ import { mdiAlbum, + mdiAlertCircleOutline, mdiChartTimelineVariant, mdiCheckCircleOutline, mdiChevronRight, mdiClockOutline, + mdiInformationOutline, mdiExportVariant, mdiMicrophone, mdiMusic, @@ -15,7 +17,8 @@ import { import axios from 'axios'; import Head from 'next/head'; import Link from 'next/link'; -import React, { ReactElement, useEffect, useMemo, useState } from 'react'; +import React, { DragEvent, ReactElement, useEffect, useMemo, useRef, useState } from 'react'; +import AudioWaveformPreview from '../components/Studio/AudioWaveformPreview'; import BaseButton from '../components/BaseButton'; import BaseIcon from '../components/BaseIcon'; import CardBox from '../components/CardBox'; @@ -26,6 +29,7 @@ import SectionTitleLineWithButton from '../components/SectionTitleLineWithButton import { getPageTitle } from '../config'; import { hasPermission } from '../helpers/userPermissions'; import LayoutAuthenticated from '../layouts/Authenticated'; +import FileUploader from '../components/Uploaders/UploadService'; import { useAppSelector } from '../stores/hooks'; type GenreOption = { @@ -100,12 +104,24 @@ type RecentSession = { } | null; }; +type UploadedFile = { + id?: string; + name: string; + sizeInBytes?: number; + privateUrl: string; + publicUrl: string; + new: boolean; +}; + type CreatedLink = { id: string; href: string; status?: string; format?: string; title?: string; + name?: string; + fileName?: string; + publicUrl?: string; bpm?: number; key_signature?: string; mood?: string; @@ -119,6 +135,7 @@ type CreatedSession = { song: CreatedLink; generationRequest: CreatedLink; recordingSession?: CreatedLink | null; + vocalAsset?: CreatedLink | null; mixSession: CreatedLink; masteringSession: CreatedLink; exportJob: CreatedLink; @@ -147,6 +164,22 @@ type LaunchpadResponse = { recentSessions: RecentSession[]; }; +type DetectedVocalAnalysis = { + bpm?: number; + bpmConfidence?: number; + key?: string; + keyConfidence?: number; + overallConfidence?: number; +}; + +type SubmittedVocalPreview = { + file?: File | null; + audioUrl?: string; + name: string; + sizeInBytes?: number; + analysis?: DetectedVocalAnalysis | null; +}; + type FormState = { title: string; genreId: string; @@ -196,6 +229,11 @@ const vocalModes: Array<{ value: FormState['vocalMode']; title: string; descript }, ]; +const vocalUploadSchema = { + size: 20 * 1024 * 1024, + formats: ['mp3', 'wav'], +}; + const stageCards = [ { label: 'Beat generation', @@ -214,6 +252,456 @@ const stageCards = [ }, ]; +const NOTE_NAMES = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']; +const MAJOR_PROFILE = [6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88]; +const MINOR_PROFILE = [6.33, 2.68, 3.52, 5.38, 2.6, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17]; + +function formatFileSize(sizeInBytes?: number) { + if (!sizeInBytes) { + return ''; + } + + const sizeInKb = sizeInBytes / 1024; + + if (sizeInKb < 1024) { + return `${Math.max(1, Math.round(sizeInKb))} KB`; + } + + return `${(sizeInKb / 1024).toFixed(1)} MB`; +} + +function normalizeTempo(value: number) { + let bpm = value; + + while (bpm < 70) { + bpm *= 2; + } + + while (bpm > 180) { + bpm /= 2; + } + + return Math.round(bpm); +} + +function normalizeConfidence(value?: number) { + if (typeof value !== 'number' || Number.isNaN(value)) { + return 0; + } + + return Math.min(1, Math.max(0, value)); +} + +function getConfidenceLabel(value?: number) { + const confidence = normalizeConfidence(value); + + if (confidence >= 0.78) { + return 'High'; + } + + if (confidence >= 0.5) { + return 'Medium'; + } + + return 'Low'; +} + +function getConfidenceClasses(value?: number) { + const confidence = normalizeConfidence(value); + + if (confidence >= 0.78) { + return 'border-emerald-300/30 bg-emerald-500/10 text-emerald-100'; + } + + if (confidence >= 0.5) { + return 'border-amber-300/30 bg-amber-500/10 text-amber-100'; + } + + return 'border-rose-300/30 bg-rose-500/10 text-rose-100'; +} + +function formatConfidenceValue(value?: number) { + const confidence = normalizeConfidence(value); + + if (!confidence) { + return ''; + } + + return `${Math.round(confidence * 100)}%`; +} + +function formatConfidenceSummary(value?: number) { + const percentage = formatConfidenceValue(value); + + if (!percentage) { + return ''; + } + + return `${getConfidenceLabel(value)} confidence · ${percentage}`; +} + +function getConfidenceTooltip(metricLabel: string, value?: number) { + const label = getConfidenceLabel(value); + const percentage = formatConfidenceValue(value); + const summary = percentage + ? `${metricLabel} is currently ${label.toLowerCase()} confidence (${percentage}).` + : `${metricLabel} confidence is currently unavailable.`; + + if (label === 'High') { + return `${summary} High means the vocal signal looked stable and the detector found a strong match.`; + } + + if (label === 'Medium') { + return `${summary} Medium means the suggestion is usable, but you should still confirm it by ear.`; + } + + return `${summary} Low means the take looks noisy, sparse, or ambiguous, so you should verify it manually.`; +} + +function hasLowConfidence(value?: number) { + const confidence = normalizeConfidence(value); + + return confidence > 0 && confidence < 0.5; +} + +function getLowConfidenceWarning(analysis?: DetectedVocalAnalysis | null, analysisError?: string) { + if (analysisError) { + return 'This vocal take uploaded successfully, but the signal looks noisy or ambiguous enough that automatic BPM/key detection could not lock in cleanly.'; + } + + if (!analysis) { + return ''; + } + + if (hasLowConfidence(analysis.bpmConfidence) && hasLowConfidence(analysis.keyConfidence)) { + return 'Low-confidence BPM and key results usually mean the vocal is noisy, sparse, or too free-form to match reliably. Double-check both by ear before launching.'; + } + + if (hasLowConfidence(analysis.bpmConfidence)) { + return 'The BPM estimate is low confidence, which usually means the vocal rhythm is noisy or loosely phrased. Use tap tempo or set BPM manually before launching.'; + } + + if (hasLowConfidence(analysis.keyConfidence)) { + return 'The key estimate is low confidence, which usually means the pitch center is unclear or the take is noisy. Treat the suggested key as a starting point and verify by ear.'; + } + + if (hasLowConfidence(analysis.overallConfidence)) { + return 'Overall analysis confidence is low, so the uploaded take may be too noisy or ambiguous for strong automatic matching. Review the suggestions carefully.'; + } + + return ''; +} + +function mixToMono(audioBuffer: AudioBuffer) { + const mixed = new Float32Array(audioBuffer.length); + + for (let channelIndex = 0; channelIndex < audioBuffer.numberOfChannels; channelIndex += 1) { + const channelData = audioBuffer.getChannelData(channelIndex); + + for (let sampleIndex = 0; sampleIndex < audioBuffer.length; sampleIndex += 1) { + mixed[sampleIndex] += channelData[sampleIndex] / audioBuffer.numberOfChannels; + } + } + + return mixed; +} + +function downSampleBuffer(samples: Float32Array, sampleRate: number, targetRate = 22050) { + if (sampleRate <= targetRate) { + return { sampleRate, samples }; + } + + const ratio = sampleRate / targetRate; + const newLength = Math.max(1, Math.round(samples.length / ratio)); + const downSampled = new Float32Array(newLength); + + for (let index = 0; index < newLength; index += 1) { + const start = Math.floor(index * ratio); + const end = Math.min(samples.length, Math.floor((index + 1) * ratio)); + let sum = 0; + + for (let sampleIndex = start; sampleIndex < end; sampleIndex += 1) { + sum += samples[sampleIndex]; + } + + downSampled[index] = end > start ? sum / (end - start) : samples[start] || 0; + } + + return { sampleRate: targetRate, samples: downSampled }; +} + +function estimateTempo(samples: Float32Array, sampleRate: number) { + const windowSizes = [1024, 2048, 4096]; + const aggregatedScores = new Map(); + let totalPeaks = 0; + + windowSizes.forEach((windowSize, windowIndex) => { + const energies: number[] = []; + + for (let start = 0; start + windowSize < samples.length; start += windowSize) { + let sum = 0; + + for (let sampleIndex = start; sampleIndex < start + windowSize; sampleIndex += 1) { + sum += Math.abs(samples[sampleIndex]); + } + + energies.push(sum / windowSize); + } + + if (energies.length < 8) { + return; + } + + const meanEnergy = energies.reduce((total, value) => total + value, 0) / energies.length; + const variance = energies.reduce((total, value) => total + (value - meanEnergy) ** 2, 0) / energies.length; + const threshold = meanEnergy + Math.sqrt(variance) * (0.22 + windowIndex * 0.08); + const peaks: number[] = []; + + for (let index = 1; index < energies.length - 1; index += 1) { + if (energies[index] > threshold && energies[index] >= energies[index - 1] && energies[index] >= energies[index + 1]) { + if (!peaks.length || index - peaks[peaks.length - 1] > 1 + windowIndex) { + peaks.push(index); + } + } + } + + if (peaks.length < 2) { + return; + } + + totalPeaks += peaks.length; + const windowWeight = 1.2 - windowIndex * 0.2; + + peaks.forEach((peak, peakIndex) => { + for (let offset = 1; offset <= 8 && peakIndex + offset < peaks.length; offset += 1) { + const interval = peaks[peakIndex + offset] - peak; + + if (!interval) { + continue; + } + + const bpm = normalizeTempo((60 * sampleRate) / (interval * windowSize)); + const intervalWeight = windowWeight / offset; + aggregatedScores.set(bpm, (aggregatedScores.get(bpm) || 0) + intervalWeight); + } + }); + }); + + if (!aggregatedScores.size) { + return null; + } + + const smoothedScores = new Map(); + Array.from(aggregatedScores.entries()).forEach(([bpm, score]) => { + let smoothedScore = score; + + for (let neighbor = bpm - 2; neighbor <= bpm + 2; neighbor += 1) { + if (neighbor === bpm) { + continue; + } + + const neighborScore = aggregatedScores.get(neighbor); + + if (neighborScore) { + smoothedScore += neighborScore * (neighbor === bpm - 1 || neighbor === bpm + 1 ? 0.55 : 0.25); + } + } + + smoothedScores.set(bpm, smoothedScore); + }); + + const rankedTempos = Array.from(smoothedScores.entries()).sort((left, right) => right[1] - left[1]); + const [bestEntry, secondEntry] = rankedTempos; + + if (!bestEntry) { + return null; + } + + const [bpm, bestScore] = bestEntry; + const secondScore = secondEntry?.[1] || 0; + const totalScore = rankedTempos.reduce((total, [, score]) => total + score, 0); + const dominance = totalScore ? bestScore / totalScore : 0; + const separation = bestScore ? (bestScore - secondScore) / bestScore : 0; + const peakCoverage = Math.min(1, totalPeaks / 28); + const confidence = normalizeConfidence(dominance * 0.45 + separation * 0.4 + peakCoverage * 0.15); + + return { bpm, confidence }; +} + +function estimateDominantPitch(segment: Float32Array, sampleRate: number) { + let rms = 0; + + for (let index = 0; index < segment.length; index += 1) { + rms += segment[index] * segment[index]; + } + + rms = Math.sqrt(rms / segment.length); + + if (rms < 0.012) { + return null; + } + + const minLag = Math.floor(sampleRate / 1000); + const maxLag = Math.min(Math.floor(sampleRate / 80), segment.length - 1); + let bestLag = -1; + let bestCorrelation = 0; + let secondCorrelation = 0; + + for (let lag = minLag; lag <= maxLag; lag += 1) { + let correlation = 0; + + for (let index = 0; index + lag < segment.length; index += 1) { + correlation += segment[index] * segment[index + lag]; + } + + const normalizedCorrelation = correlation / Math.max(1, segment.length - lag); + + if (normalizedCorrelation > bestCorrelation) { + secondCorrelation = bestCorrelation; + bestCorrelation = normalizedCorrelation; + bestLag = lag; + } else if (normalizedCorrelation > secondCorrelation) { + secondCorrelation = normalizedCorrelation; + } + } + + if (bestLag <= 0 || bestCorrelation <= 0) { + return null; + } + + return { + frequency: sampleRate / bestLag, + rms, + clarity: normalizeConfidence((bestCorrelation - secondCorrelation) / bestCorrelation), + }; +} + +function scorePitchProfiles(histogram: number[], profile: number[]) { + return NOTE_NAMES.map((_, tonicIndex) => + histogram.reduce((total, value, pitchIndex) => total + value * profile[(pitchIndex - tonicIndex + 12) % 12], 0), + ); +} + +function estimateMusicalKey(samples: Float32Array, sampleRate: number) { + const segmentSizes = [4096, 8192]; + const totalSegments = 24; + const histogram = new Array(12).fill(0) as number[]; + let acceptedSegments = 0; + let clarityTotal = 0; + + segmentSizes.forEach((segmentSize, segmentIndex) => { + if (samples.length < segmentSize) { + return; + } + + const step = Math.max(1, Math.floor((samples.length - segmentSize) / totalSegments)); + + for (let offset = 0; offset + segmentSize <= samples.length; offset += step) { + const segment = samples.slice(offset, offset + segmentSize); + const detectedPitch = estimateDominantPitch(segment, sampleRate); + + if (!detectedPitch || detectedPitch.frequency < 80 || detectedPitch.frequency > 1000) { + continue; + } + + const midi = Math.round(69 + 12 * Math.log2(detectedPitch.frequency / 440)); + const pitchClass = ((midi % 12) + 12) % 12; + const segmentWeight = detectedPitch.rms * (0.75 + detectedPitch.clarity * 0.75) * (segmentIndex === 0 ? 1 : 0.85); + + histogram[pitchClass] += segmentWeight; + acceptedSegments += 1; + clarityTotal += detectedPitch.clarity; + } + }); + + const histogramTotal = histogram.reduce((total, value) => total + value, 0); + + if (!histogramTotal || !acceptedSegments) { + return null; + } + + const normalizedHistogram = histogram.map((value) => value / histogramTotal); + const majorScores = scorePitchProfiles(normalizedHistogram, MAJOR_PROFILE); + const minorScores = scorePitchProfiles(normalizedHistogram, MINOR_PROFILE); + const rankedScores = [ + ...majorScores.map((score, index) => ({ label: `${NOTE_NAMES[index]} major`, score })), + ...minorScores.map((score, index) => ({ label: `${NOTE_NAMES[index]} minor`, score })), + ].sort((left, right) => right.score - left.score); + + const bestMatch = rankedScores[0]; + const runnerUp = rankedScores[1]; + + if (!bestMatch) { + return null; + } + + const margin = bestMatch.score ? (bestMatch.score - (runnerUp?.score || 0)) / bestMatch.score : 0; + const segmentCoverage = Math.min(1, acceptedSegments / 16); + const averageClarity = clarityTotal / acceptedSegments; + const confidence = normalizeConfidence(margin * 0.55 + averageClarity * 0.3 + segmentCoverage * 0.15); + + return { + key: bestMatch.label, + confidence, + }; +} + +async function detectVocalAnalysis(file: File): Promise { + if (typeof window === 'undefined' || !window.AudioContext) { + throw new Error('Audio analysis is unavailable in this browser.'); + } + + const audioBuffer = await file.arrayBuffer(); + const audioContext = new window.AudioContext(); + + try { + const decoded = await audioContext.decodeAudioData(audioBuffer.slice(0)); + const mono = mixToMono(decoded); + const { sampleRate, samples } = downSampleBuffer(mono, decoded.sampleRate); + const tempoMatch = estimateTempo(samples, sampleRate); + const keyMatch = estimateMusicalKey(samples, sampleRate); + const bpm = tempoMatch?.bpm; + const bpmConfidence = tempoMatch?.confidence; + const key = keyMatch?.key; + const keyConfidence = keyMatch?.confidence; + const confidenceValues = [bpmConfidence, keyConfidence].filter((value): value is number => typeof value === 'number'); + + if (!bpm && !key) { + return null; + } + + const overallConfidence = normalizeConfidence( + confidenceValues.reduce((total, value) => total + value, 0) / Math.max(1, confidenceValues.length), + ); + + return { bpm, bpmConfidence, key, keyConfidence, overallConfidence }; + } finally { + if (audioContext.state !== 'closed') { + await audioContext.close(); + } + } +} + +function formatDetectedAnalysis(analysis?: DetectedVocalAnalysis | null) { + if (!analysis) { + return ''; + } + + const parts: string[] = []; + + if (analysis.bpm) { + const confidence = formatConfidenceValue(analysis.bpmConfidence); + parts.push(`${analysis.bpm} BPM${confidence ? ` (${confidence})` : ''}`); + } + + if (analysis.key) { + const confidence = formatConfidenceValue(analysis.keyConfidence); + parts.push(`${analysis.key}${confidence ? ` (${confidence})` : ''}`); + } + + return parts.join(' · '); +} + function getStatusClasses(status?: string) { switch (status) { case 'completed': @@ -266,6 +754,22 @@ const StudioPage = () => { const [aiModels, setAiModels] = useState([]); const [recentSessions, setRecentSessions] = useState([]); const [createdSession, setCreatedSession] = useState(null); + const [uploadedVocal, setUploadedVocal] = useState(null); + const [uploadedVocalFile, setUploadedVocalFile] = useState(null); + const [submittedVocalPreview, setSubmittedVocalPreview] = useState(null); + const [vocalAnalysis, setVocalAnalysis] = useState(null); + const [isUploadingVocal, setIsUploadingVocal] = useState(false); + const [isAnalyzingVocal, setIsAnalyzingVocal] = useState(false); + const [isVocalDragActive, setIsVocalDragActive] = useState(false); + const [vocalUploadError, setVocalUploadError] = useState(''); + const [vocalAnalysisError, setVocalAnalysisError] = useState(''); + const [tapTempoMarks, setTapTempoMarks] = useState([]); + const [tapTempoBpm, setTapTempoBpm] = useState(null); + const [tapTempoStatus, setTapTempoStatus] = useState('Tap 4 to 8 times in rhythm to estimate BPM manually.'); + + const vocalFileInputRef = useRef(null); + const vocalDragDepthRef = useRef(0); + const lastAutoDetectedValuesRef = useRef({ targetBpm: '', targetKey: '' }); const selectedGenre = useMemo( () => genres.find((genre) => genre.id === form.genreId) || null, @@ -277,6 +781,73 @@ const StudioPage = () => { [form.masteringPresetId, masteringPresets], ); + const detectedAnalysisLabel = formatDetectedAnalysis(vocalAnalysis); + const detectedOverallConfidence = formatConfidenceSummary(vocalAnalysis?.overallConfidence); + const hasDetectedSuggestions = Boolean(vocalAnalysis?.bpm || vocalAnalysis?.key); + const canApplyDetectedSuggestions = Boolean( + (vocalAnalysis?.bpm && form.targetBpm !== String(vocalAnalysis.bpm)) || (vocalAnalysis?.key && form.targetKey !== vocalAnalysis.key), + ); + const lowConfidenceWarning = getLowConfidenceWarning(vocalAnalysis, vocalAnalysisError); + const showLowConfidenceWarning = Boolean(form.vocalMode === 'upload' && uploadedVocal && lowConfidenceWarning); + const shouldShowTapTempoAssist = Boolean( + form.vocalMode === 'upload' && uploadedVocal && (vocalAnalysisError || !vocalAnalysis?.bpm || hasLowConfidence(vocalAnalysis?.bpmConfidence)), + ); + const tapTempoCount = tapTempoMarks.length; + + const syncDetectedValuesToForm = (analysis: DetectedVocalAnalysis | null, force = false) => { + if (!analysis) { + return; + } + + const detectedValues = { + targetBpm: analysis.bpm ? String(analysis.bpm) : '', + targetKey: analysis.key || '', + }; + + setForm((current) => { + const next = { ...current }; + + if ( + detectedValues.targetBpm && + (force || !current.targetBpm || current.targetBpm === lastAutoDetectedValuesRef.current.targetBpm) + ) { + next.targetBpm = detectedValues.targetBpm; + } + + if ( + detectedValues.targetKey && + (force || !current.targetKey || current.targetKey === lastAutoDetectedValuesRef.current.targetKey) + ) { + next.targetKey = detectedValues.targetKey; + } + + return next; + }); + + if (detectedValues.targetBpm) { + lastAutoDetectedValuesRef.current.targetBpm = detectedValues.targetBpm; + } + + if (detectedValues.targetKey) { + lastAutoDetectedValuesRef.current.targetKey = detectedValues.targetKey; + } + }; + + const resetVocalPicker = () => { + if (vocalFileInputRef.current) { + vocalFileInputRef.current.value = ''; + } + + vocalDragDepthRef.current = 0; + setIsVocalDragActive(false); + }; + + const resetTapTempo = (message = 'Tap 4 to 8 times in rhythm to estimate BPM manually.') => { + setTapTempoMarks([]); + setTapTempoBpm(null); + setTapTempoStatus(message); + }; + const loadLaunchpad = async () => { const { data } = await axios.get('/studio/launchpad'); @@ -342,15 +913,209 @@ const StudioPage = () => { } as FormState)); }; + const clearUploadedVocal = () => { + setUploadedVocal(null); + setUploadedVocalFile(null); + setVocalAnalysis(null); + setVocalUploadError(''); + setVocalAnalysisError(''); + setIsAnalyzingVocal(false); + resetTapTempo(); + resetVocalPicker(); + }; + + const handleSelectedVocalFile = async (file: File) => { + try { + setIsUploadingVocal(true); + setIsAnalyzingVocal(false); + setVocalUploadError(''); + setVocalAnalysisError(''); + setErrorMessage(''); + resetTapTempo(); + + FileUploader.validate(file, vocalUploadSchema); + const remoteFile = (await FileUploader.upload('assets/file_blobs', file, vocalUploadSchema)) as UploadedFile; + + setUploadedVocal(remoteFile); + setUploadedVocalFile(file); + setVocalAnalysis(null); + + setIsAnalyzingVocal(true); + + try { + const analysis = await detectVocalAnalysis(file); + setVocalAnalysis(analysis); + syncDetectedValuesToForm(analysis); + } catch (error) { + console.error('Failed to analyze uploaded vocal:', error); + setVocalAnalysis(null); + setVocalAnalysisError('The vocal uploaded successfully, but automatic BPM/key detection is unavailable for this take.'); + } finally { + setIsAnalyzingVocal(false); + } + } catch (error) { + console.error('Failed to upload vocal file:', error); + clearUploadedVocal(); + setVocalUploadError(error instanceof Error ? error.message : 'We could not upload the vocal file. Please try again.'); + } finally { + setIsUploadingVocal(false); + resetVocalPicker(); + } + }; + + const handleVocalUpload = async (event: React.ChangeEvent) => { + const file = event.target.files?.[0]; + + if (!file) { + return; + } + + await handleSelectedVocalFile(file); + }; + + const handleVocalDragEnter = (event: DragEvent) => { + event.preventDefault(); + event.stopPropagation(); + + if (!canCreateProjects || isSubmitting || isUploadingVocal) { + return; + } + + vocalDragDepthRef.current += 1; + setIsVocalDragActive(true); + }; + + const handleVocalDragOver = (event: DragEvent) => { + event.preventDefault(); + event.stopPropagation(); + + if (!canCreateProjects || isSubmitting || isUploadingVocal) { + return; + } + + event.dataTransfer.dropEffect = 'copy'; + setIsVocalDragActive(true); + }; + + const handleVocalDragLeave = (event: DragEvent) => { + event.preventDefault(); + event.stopPropagation(); + + if (!canCreateProjects || isSubmitting || isUploadingVocal) { + return; + } + + vocalDragDepthRef.current = Math.max(0, vocalDragDepthRef.current - 1); + + if (!vocalDragDepthRef.current) { + setIsVocalDragActive(false); + } + }; + + const handleVocalDrop = async (event: DragEvent) => { + event.preventDefault(); + event.stopPropagation(); + + vocalDragDepthRef.current = 0; + setIsVocalDragActive(false); + + if (!canCreateProjects || isSubmitting || isUploadingVocal) { + return; + } + + const file = event.dataTransfer.files?.[0]; + + if (!file) { + return; + } + + await handleSelectedVocalFile(file); + }; + + const handleTapTempo = () => { + const now = Date.now(); + + setTapTempoMarks((current) => { + const recentMarks = current.filter((timestamp) => now - timestamp <= 8000); + const timedOut = Boolean(recentMarks.length && now - recentMarks[recentMarks.length - 1] > 2500); + const baseMarks = timedOut ? [] : recentMarks; + const nextMarks = [...baseMarks, now].slice(-8); + + if (timedOut) { + setTapTempoBpm(null); + setTapTempoStatus('Tap timing reset after a pause. Start tapping again on the beat.'); + } + + if (nextMarks.length < 4) { + const tapsRemaining = 4 - nextMarks.length; + setTapTempoBpm(null); + setTapTempoStatus( + `Captured ${nextMarks.length} tap${nextMarks.length === 1 ? '' : 's'}. Tap ${tapsRemaining} more time${tapsRemaining === 1 ? '' : 's'} to estimate BPM.`, + ); + return nextMarks; + } + + const intervals = nextMarks + .slice(1) + .map((timestamp, index) => timestamp - nextMarks[index]) + .filter((interval) => interval > 0 && interval < 2000); + + if (!intervals.length) { + setTapTempoBpm(null); + setTapTempoStatus('Those taps were too uneven to estimate BPM. Try tapping a steady pulse.'); + return nextMarks; + } + + const averageInterval = intervals.reduce((total, interval) => total + interval, 0) / intervals.length; + const bpm = normalizeTempo(60000 / averageInterval); + const bpmValue = String(bpm); + + setTapTempoBpm(bpm); + setTapTempoStatus(`Tap tempo estimated ${bpm} BPM from ${nextMarks.length} taps. Keep tapping to refine it, or keep this value.`); + setForm((currentForm) => ({ + ...currentForm, + targetBpm: bpmValue, + })); + lastAutoDetectedValuesRef.current.targetBpm = bpmValue; + + return nextMarks; + }); + }; + + const handleResetTapTempo = () => { + resetTapTempo(); + }; + const handleSubmit = async (event: React.FormEvent) => { event.preventDefault(); + if (form.vocalMode === 'upload' && !uploadedVocal) { + setErrorMessage('Upload an MP3 or WAV vocal take before launching the session.'); + return; + } + try { setIsSubmitting(true); setErrorMessage(''); setSuccessMessage(''); - const { data } = await axios.post<{ message: string; session: CreatedSession }>('/studio/launchpad', form); + const { data } = await axios.post<{ message: string; session: CreatedSession }>('/studio/launchpad', { + ...form, + vocalUpload: uploadedVocal, + }); + + if (form.vocalMode === 'upload' && uploadedVocal) { + setSubmittedVocalPreview({ + file: uploadedVocalFile, + audioUrl: uploadedVocal.publicUrl, + name: uploadedVocal.name, + sizeInBytes: uploadedVocal.sizeInBytes, + analysis: vocalAnalysis, + }); + } else { + setSubmittedVocalPreview(null); + } + setCreatedSession(data.session); setSuccessMessage(data.message); setForm((current) => ({ @@ -359,7 +1124,9 @@ const StudioPage = () => { languageId: current.languageId, masteringPresetId: current.masteringPresetId, targetBpm: current.targetBpm, + targetKey: current.targetKey, })); + clearUploadedVocal(); await loadLaunchpad(); } catch (error) { console.error('Failed to create studio session:', error); @@ -508,6 +1275,114 @@ const StudioPage = () => {
+ + {form.vocalMode === 'upload' ? ( + +
+ + +
{ + if (!canCreateProjects || isSubmitting || isUploadingVocal) { + return; + } + + vocalFileInputRef.current?.click(); + }} + onKeyDown={(event) => { + if (event.key === 'Enter' || event.key === ' ') { + event.preventDefault(); + + if (!canCreateProjects || isSubmitting || isUploadingVocal) { + return; + } + + vocalFileInputRef.current?.click(); + } + }} + onDragEnter={handleVocalDragEnter} + onDragOver={handleVocalDragOver} + onDragLeave={handleVocalDragLeave} + onDrop={handleVocalDrop} + className={`rounded-3xl border border-dashed p-5 transition ${ + !canCreateProjects || isSubmitting + ? 'cursor-not-allowed border-white/10 bg-slate-950/40 text-slate-500' + : isVocalDragActive + ? 'cursor-pointer border-emerald-300/60 bg-emerald-500/10 shadow-lg shadow-emerald-900/20' + : 'cursor-pointer border-violet-300/20 bg-slate-950/50 hover:border-violet-300/40 hover:bg-slate-900/80' + }`} + > +
+
+
Drag & drop vocal intake
+
+ {isUploadingVocal ? 'Uploading vocal…' : uploadedVocal ? 'Replace the current vocal take' : 'Drop an MP3/WAV here or click to browse'} +
+

+ Upload starts immediately, then the browser suggests BPM and key from the take so you can match the beat faster. +

+
+ +
+ {isVocalDragActive ? 'Release to upload' : uploadedVocal ? 'Take attached' : 'Awaiting file'} +
+
+ +
+ + {isUploadingVocal ? 'Uploading vocal…' : uploadedVocal ? 'Choose another file' : 'Browse files'} + + + {uploadedVocal ? ( + + ) : null} + +
+ {uploadedVocal ? 'Waveform ready · asset will attach on launch' : 'No vocal take attached yet'} +
+
+
+ + {vocalUploadError ?

{vocalUploadError}

: null} + {vocalAnalysisError ?

{vocalAnalysisError}

: null} + +
+ +
+
+
+ ) : null}
{
+ {shouldShowTapTempoAssist ? ( +
+
+
+
Manual BPM assist
+

+ Automatic BPM confidence is low for this take, so you can tap the pulse manually to lock in a steadier tempo. +

+
+ +
+ + +
+
+ +
+ {tapTempoCount || 0} taps captured + {tapTempoBpm ? ( + Tap BPM {tapTempoBpm} + ) : null} +
+ +

{tapTempoStatus}

+
+ ) : null} + + {form.vocalMode === 'upload' ? ( +
+
+
+
+ Auto-detected from vocal + + + Confidence guide + +
+

+ {isAnalyzingVocal + ? 'Analyzing the uploaded take for BPM and key suggestions…' + : detectedAnalysisLabel + ? `Suggested values: ${detectedAnalysisLabel}${detectedOverallConfidence ? ` · ${detectedOverallConfidence}` : ''}` + : 'Upload a vocal take to auto-fill BPM and key suggestions here.'} +

+
+ + {hasDetectedSuggestions && canApplyDetectedSuggestions ? ( + + ) : null} +
+ + {hasDetectedSuggestions ? ( +
+ {vocalAnalysis?.bpm ? ( + + BPM {vocalAnalysis.bpm} · {formatConfidenceSummary(vocalAnalysis.bpmConfidence) || getConfidenceLabel(vocalAnalysis.bpmConfidence)} + + ) : null} + {vocalAnalysis?.key ? ( + + Key {vocalAnalysis.key} · {formatConfidenceSummary(vocalAnalysis.keyConfidence) || getConfidenceLabel(vocalAnalysis.keyConfidence)} + + ) : null} + {detectedOverallConfidence ? ( + + Overall {detectedOverallConfidence} + + ) : null} +
+ ) : null} + + {showLowConfidenceWarning ? ( +
+
+ +
+
Low-confidence vocal warning
+

{lowConfidenceWarning}

+
+
+
+ ) : null} +
+ ) : null} +