In [1]:
%matplotlib inline
import seaborn
import numpy, scipy, scipy.spatial, matplotlib.pyplot as plt, IPython.display as ipd
import librosa, librosa.display
plt.rcParams['figure.figsize'] = (9, 7)

Dynamic Time Warping Example¶

Load two audio files:

In [2]:
x1, sr1 = librosa.load('audio/sir_duke_oscar_dub_fast.mp3')
x2, sr2 = librosa.load('audio/sir_duke_oscar_dub_slow.mp3')
In [3]:
print(sr1, sr2)
22050 22050

Listen:

In [4]:
ipd.Audio(x1, rate=sr1)
Out[4]:
In [5]:
ipd.Audio(x2, rate=sr2)
Out[5]:

Compute chromagrams:

In [6]:
hop_length = 256
In [7]:
C1_cens = librosa.feature.chroma_cens(y=x1, sr=sr1, hop_length=hop_length)
C2_cens = librosa.feature.chroma_cens(y=x2, sr=sr2, hop_length=hop_length)
In [8]:
print(C1_cens.shape)
print(C2_cens.shape)
(12, 669)
(12, 856)

Compute CQT only for visualization:

In [9]:
C1_cqt = librosa.cqt(y=x1, sr=sr1, hop_length=hop_length)
C2_cqt = librosa.cqt(y=x2, sr=sr2, hop_length=hop_length)
In [10]:
C1_cqt_mag = librosa.amplitude_to_db(numpy.abs(C1_cqt))
C2_cqt_mag = librosa.amplitude_to_db(numpy.abs(C2_cqt))

Define DTW functions:
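
The table construction follows the standard DTW recurrence: writing $d(x_i, y_j)$ for the local distance between frame $i$ of $x$ and frame $j$ of $y$,

$$D(i, j) = d(x_i, y_j) + \min\big(D(i-1, j),\ D(i, j-1),\ D(i-1, j-1)\big),$$

with $D(0, 0) = 0$ and $\infty$ along the first row and column, so every warping path is anchored at the start of both signals.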

In [11]:
def dtw_table(x, y, distance=None):
    if distance is None:
        distance = scipy.spatial.distance.euclidean
    nx = len(x)
    ny = len(y)
    table = [[0 for _ in range(ny+1)] for _ in range(nx+1)]
    
    # Compute left column separately, i.e. j=0.
    for i in range(1, nx+1):
        table[i][0] = numpy.inf
        
    # Compute top row separately, i.e. i=0.
    for j in range(1, ny+1):
        table[0][j] = numpy.inf
        
    # Fill in the rest.
    for i in range(1, nx+1):
        for j in range(1, ny+1):
            d = distance(x[i-1], y[j-1])
            table[i][j] = d + min(table[i-1][j], table[i][j-1], table[i-1][j-1])
    return numpy.array(table)
In [12]:
def dtw(x, y, table):
    # Backtrack from the bottom-right cell of the table to the origin,
    # at each step moving to the cheapest of the three predecessor cells.
    i = len(x)
    j = len(y)
    path = [(i, j)]
    while i > 0 or j > 0:
        minval = numpy.inf
        if table[i-1][j] < minval:    # vertical step: advance in x only
            minval = table[i-1][j]
            step = (i-1, j)
        if table[i][j-1] < minval:    # horizontal step: advance in y only
            minval = table[i][j-1]
            step = (i, j-1)
        if table[i-1][j-1] < minval:  # diagonal step: advance in both
            minval = table[i-1][j-1]
            step = (i-1, j-1)
        path.insert(0, step)
        i, j = step
    return numpy.array(path)
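
As a quick sanity check of these functions, here is a small toy example (a sketch, not part of the original notebook) that aligns two short 1-D sequences, using absolute difference as the scalar distance:

In [ ]:
a = [0, 1, 2, 1, 0]
b = [0, 0, 1, 2, 1, 0, 0]
table = dtw_table(a, b, distance=lambda u, v: abs(u - v))
print(dtw(a, b, table))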

Run DTW:

In [13]:
D = dtw_table(C1_cens.T, C2_cens.T, distance=scipy.spatial.distance.cityblock)
In [14]:
path = dtw(C1_cens.T, C2_cens.T, D)

Listen to both recordings from the same alignment marker:

In [15]:
path.shape
Out[15]:
(870, 2)
In [16]:
i1, i2 = librosa.frames_to_samples(path[300], hop_length=hop_length)
In [17]:
print(i1, i2)
55040 76800
In [18]:
ipd.Audio(x1[i1:], rate=sr1)
Out[18]:
In [19]:
ipd.Audio(x2[i2:], rate=sr2)
Out[19]:

Visualize both signals and their alignment:

In [20]:
plt.figure(figsize=(9, 8))

# Top left plot.
ax2 = plt.axes([0, 0.2, 0.20, 0.8])
ax2.imshow(C2_cqt_mag.T[:,::-1], origin='lower', aspect='auto', cmap='coolwarm')
ax2.set_ylabel('Signal 2')
ax2.set_xticks([])
ax2.set_yticks([])
ax2.set_ylim(20)

# Bottom right plot.
ax1 = plt.axes([0.2, 0, 0.8, 0.20])
ax1.imshow(C1_cqt_mag, origin='lower', aspect='auto', cmap='coolwarm')
ax1.set_xlabel('Signal 1')
ax1.set_xticks([])
ax1.set_yticks([])
ax1.set_ylim(20)

# Top right plot.
ax3 = plt.axes([0.2, 0.2, 0.8, 0.8])
ax3.imshow(D.T, aspect='auto', origin='lower', interpolation='nearest', cmap='gray')
ax3.set_xticks([])
ax3.set_yticks([])

# Path.
ax3.plot(path[:,0], path[:,1], 'r')
Out[20]:
[<matplotlib.lines.Line2D at 0x11c4de110>]
In [21]:
plt.figure(figsize=(11, 5))

# Top plot.
ax1 = plt.axes([0, 0.60, 1, 0.40])
ax1.imshow(C1_cqt_mag, origin='lower', aspect='auto', cmap='coolwarm')
ax1.set_ylabel('Signal 1')
ax1.set_xticks([])
ax1.set_yticks([])
ax1.set_ylim(20)

# Bottom plot.
ax2 = plt.axes([0, 0, 1, 0.40])
ax2.imshow(C2_cqt_mag, origin='lower', aspect='auto', cmap='coolwarm')
ax2.set_ylabel('Signal 2')
ax2.set_xticks([])
ax2.set_yticks([])
ax2.set_ylim(20)

# Middle plot.
line_color = 'k'
step = 30
n1 = float(C1_cqt.shape[1])
n2 = float(C2_cqt.shape[1])
ax3 = plt.axes([0, 0.40, 1, 0.20])
for t in path[::step]:
    ax3.plot((t[0]/n1, t[1]/n2), (1, -1), color=line_color)
ax3.set_xlim(0, 1)
ax3.set_ylim(-1, 1)
    
# Path markers on top and bottom plot.
y1_min, y1_max = ax1.get_ylim()
y2_min, y2_max = ax2.get_ylim()
ax1.vlines([t[0] for t in path[::step]], y1_min, y1_max, color=line_color)
ax2.vlines([t[1] for t in path[::step]], y2_min, y2_max, color=line_color)
ax3.set_xticks([])
ax3.set_yticks([])
Out[21]:
[]

Exercises¶

Try each pair of audio files among the following:

In [22]:
ls audio/sir_duke*
audio/sir_duke_oscar_dub_fast.mp3     audio/sir_duke_stefan_balke_fast.mp3
audio/sir_duke_oscar_dub_slow.mp3     audio/sir_duke_stefan_balke_slow.mp3

Try adjusting the hop length, distance metric, and feature space.
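
For example, here is a sketch of one variation (these particular parameter choices are not from the original notebook): recompute the features with a larger hop length and use the Euclidean metric instead of cityblock.

In [ ]:
hop = 512  # an alternative hop length, chosen for illustration
C1 = librosa.feature.chroma_cens(y=x1, sr=sr1, hop_length=hop)
C2 = librosa.feature.chroma_cens(y=x2, sr=sr2, hop_length=hop)
D_euc = dtw_table(C1.T, C2.T, distance=scipy.spatial.distance.euclidean)
path_euc = dtw(C1.T, C2.T, D_euc)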

Instead of chroma_cens, try chroma_stft or librosa.feature.mfcc.
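
One possible substitution, sketched with the same hop length as before:

In [ ]:
M1 = librosa.feature.mfcc(y=x1, sr=sr1, hop_length=hop_length)
M2 = librosa.feature.mfcc(y=x2, sr=sr2, hop_length=hop_length)
D_mfcc = dtw_table(M1.T, M2.T, distance=scipy.spatial.distance.cityblock)
path_mfcc = dtw(M1.T, M2.T, D_mfcc)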

Try magnitude scaling the feature matrices, e.g. amplitude_to_db or $\log(1 + \lambda x)$.
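
A sketch of the latter, using numpy.log1p to compute $\log(1 + \lambda x)$; the value of $\lambda$ below is arbitrary and worth experimenting with:

In [ ]:
lam = 10.0  # arbitrary compression strength
C1_log = numpy.log1p(lam * C1_cens)
C2_log = numpy.log1p(lam * C2_cens)
D_log = dtw_table(C1_log.T, C2_log.T, distance=scipy.spatial.distance.cityblock)
path_log = dtw(C1_log.T, C2_log.T, D_log)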