Peter Ma, Brian Cottrell
Created January 9, 2020 © GPL3+

Jetson Spectrogram

Using TensorFlow AI to turn WAV audio into a spectrogram

Jetson Spectrogram

Things used in this project

Hardware components

NVIDIA Jetson Nano Developer Kit
7 Inch LCD Cape for Beagle Bone Black - Touch Display
Seeed 7 Inch LCD Cape for Beagle Bone Black - Touch Display

Software apps and online services

NVIDIA Jetpack

Hand tools and fabrication machines

3D Printer (generic)
3D Printer (generic)


import tensorflow as tf
# FIXME: audio_ops.decode_wav is deprecated, use tensorflow_io.IOTensor.from_audio
from tensorflow.contrib.framework.python.ops import audio_ops

# Enable eager execution for a more interactive frontend.
# If using the default graph mode, you'll probably need to run in a session.

def audio_to_spectrogram(
        audio_contents,
        width,
        height,
        channels=1,
        window_size=1024,
        stride=64,
        brightness=100.):
    """Decode and build a spectrogram using a wav string tensor.

    NOTE(review): the published listing lost the parameter list, so the
    default values for `channels`, `window_size`, `stride` and `brightness`
    are reconstructed assumptions -- TODO confirm against the original
    project. The parameter names and order follow the list below, and the
    first three are required because the script calls
    `audio_to_spectrogram(audio, 224, 224)`.

    Args:
      audio_contents: String tensor of the wav audio contents.
      width: Spectrogram width.
      height: Spectrogram height.
      channels: Audio channel count.
      window_size: Size of the spectrogram window.
      stride: Size of the spectrogram stride.
      brightness: Brightness of the spectrogram.

    Returns:
      0-D string Tensor with the image contents.
    """
    # Decode the wav mono into a 2D tensor with time in dimension 0
    # and channel along dimension 1
    waveform = audio_ops.decode_wav(audio_contents, desired_channels=channels)

    # Compute the spectrogram
    # FIXME: Seems like this is deprecated in tensorflow 2.0 and
    # the operation only works on CPU. Change this to tf.signal.stft
    # and friends to take advantage of GPU kernels.
    spectrogram = audio_ops.audio_spectrogram(
        waveform.audio,
        window_size=window_size,
        stride=stride)

    # Adjust brightness
    brightness = tf.constant(brightness)

    # Normalize pixels: scale by brightness and clamp at the uint8 maximum
    mul = tf.multiply(spectrogram, brightness)
    min_const = tf.constant(255.)
    minimum = tf.minimum(mul, min_const)

    # Expand dims so we get the proper shape (adds a trailing axis)
    expand_dims = tf.expand_dims(minimum, -1)

    # Resize the spectrogram to input size of the model
    resize = tf.image.resize(expand_dims, [width, height])

    # Remove the leading dimension (axis 0).
    # NOTE(review): presumably this is the channel axis of the spectrogram,
    # in which case this only succeeds for channels == 1 -- TODO confirm.
    squeeze = tf.squeeze(resize, 0)

    # Tensorflow spectrogram has time along y axis and frequencies along x axis
    # so we fix that
    flip_left_right = tf.image.flip_left_right(squeeze)
    transposed = tf.image.transpose(flip_left_right)

    # Cast to uint8 so the tensor can be png encoded
    cast = tf.cast(transposed, tf.uint8)

    # Encode tensor as a png image
    return tf.image.encode_png(cast)

if __name__ == '__main__':
    # The ops below are used directly, without a session, so eager execution
    # has to be active before the first op is created. In graph mode the
    # read/write ops would only be added to a graph and never run.
    if not tf.executing_eagerly():
        tf.compat.v1.enable_eager_execution()

    input_file = tf.constant('record.wav')
    output_file = tf.constant('spectrogram.png')

    # Generate the spectrogram from the raw wav file contents
    audio = tf.io.read_file(input_file)
    image = audio_to_spectrogram(audio, 224, 224)

    # Write the png encoded image to a file
    tf.io.write_file(output_file, image)

MP3 to WAV

from os import path
from pydub import AudioSegment

# Input (MP3) and output (WAV) file paths
src = "src.mp3"
dst = "dest.wav"

# Convert MP3 to WAV: load the MP3 file, then export it in wav format
sound = AudioSegment.from_mp3(src)
sound.export(dst, format="wav")


Peter Ma

16 projects • 307 followers
Prototype Hacker, Hackathon Goer, World Traveler, Ecological balancer, integrationist, technologist, futurist.
Brian Cottrell

3 projects • 11 followers
I am a software developer with a background in physics and low level programming and I am currently focused on web and mobile development.