# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Audio summaries and TensorFlow operations to create them.
|
||
|
|
||
|
An audio summary stores a rank-2 string tensor of shape `[k, 2]`, where
|
||
|
`k` is the number of audio clips recorded in the summary. Each row of
|
||
|
the tensor is a pair `[encoded_audio, label]`, where `encoded_audio` is
|
||
|
a binary string whose encoding is specified in the summary metadata, and
|
||
|
`label` is a UTF-8 encoded Markdown string describing the audio clip.
|
||
|
|
||
|
NOTE: This module is in beta, and its API is subject to change, but the
|
||
|
data that it stores to disk will be supported forever.
|
||
|
"""

import functools
import warnings

import numpy as np

from tensorboard.util import encoder as encoder_util
from tensorboard.plugins.audio import metadata
from tensorboard.plugins.audio import summary_v2


# Export V2 versions.
audio = summary_v2.audio
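# A minimal sketch of how the re-exported V2 `audio` op above might be used
# (illustrative only: it assumes the V2 signature mirrors `tf.summary.audio`,
# that `clips` is a `[k, t, c]` float tensor in `[-1.0, 1.0]`, and that the
# `/tmp/audio_demo` logdir is just a placeholder):
#
#   import tensorflow as tf
#
#   writer = tf.summary.create_file_writer("/tmp/audio_demo")
#   with writer.as_default():
#       audio("waveform", clips, sample_rate=44100, step=0, max_outputs=2)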


_LABELS_WARNING = (
    "Labels on audio summaries are deprecated and will be removed. "
    "See <https://github.com/tensorflow/tensorboard/issues/3513>."
)


def op(
    name,
    audio,
    sample_rate,
    labels=None,
    max_outputs=3,
    encoding=None,
    display_name=None,
    description=None,
    collections=None,
):
    """Create a legacy audio summary op for use in a TensorFlow graph.

    Arguments:
      name: A unique name for the generated summary node.
      audio: A `Tensor` representing audio data with shape `[k, t, c]`,
        where `k` is the number of audio clips, `t` is the number of
        frames, and `c` is the number of channels. Elements should be
        floating-point values in `[-1.0, 1.0]`. Any of the dimensions may
        be statically unknown (i.e., `None`).
      sample_rate: An `int` or rank-0 `int32` `Tensor` that represents the
        sample rate, in Hz. Must be positive.
      labels: Deprecated. Do not set.
      max_outputs: Optional `int` or rank-0 integer `Tensor`. At most this
        many audio clips will be emitted at each step. When more than
        `max_outputs` many clips are provided, the first `max_outputs`
        many clips will be used and the rest silently discarded.
      encoding: A constant `str` (not string tensor) indicating the
        desired encoding. You can choose any format you like, as long as
        it's "wav". Please see the "API compatibility note" below.
      display_name: Optional name for this summary in TensorBoard, as a
        constant `str`. Defaults to `name`.
      description: Optional long-form description for this summary, as a
        constant `str`. Markdown is supported. Defaults to empty.
      collections: Optional list of graph collections keys. The new
        summary op is added to these collections. Defaults to
        `[GraphKeys.SUMMARIES]`.

    Returns:
      A TensorFlow summary op.

    API compatibility note: The default value of the `encoding`
    argument is _not_ guaranteed to remain unchanged across TensorBoard
    versions. In the future, we will by default encode as FLAC instead of
    as WAV. If the specific format is important to you, please provide a
    file format explicitly.
    """
    if labels is not None:
        warnings.warn(_LABELS_WARNING)

    # TODO(nickfelt): remove on-demand imports once dep situation is fixed.
    import tensorflow.compat.v1 as tf

    if display_name is None:
        display_name = name
    if encoding is None:
        encoding = "wav"

    # Only WAV encoding is currently supported; see the API compatibility
    # note in the docstring.
    if encoding == "wav":
        encoding = metadata.Encoding.Value("WAV")
        encoder = functools.partial(
            tf.audio.encode_wav, sample_rate=sample_rate
        )
    else:
        raise ValueError("Unknown encoding: %r" % encoding)

    # Validate the rank, truncate to `max_outputs` clips, encode each clip,
    # and pair it with its (possibly empty) label to form the `[k, 2]`
    # string tensor described in the module docstring.
    with tf.name_scope(name), tf.control_dependencies(
        [tf.assert_rank(audio, 3)]
    ):
        limited_audio = audio[:max_outputs]
        encoded_audio = tf.map_fn(
            encoder, limited_audio, dtype=tf.string, name="encode_each_audio"
        )
        if labels is None:
            limited_labels = tf.tile([""], tf.shape(input=limited_audio)[:1])
        else:
            limited_labels = labels[:max_outputs]
        tensor = tf.transpose(a=tf.stack([encoded_audio, limited_labels]))
        summary_metadata = metadata.create_summary_metadata(
            display_name=display_name,
            description=description,
            encoding=encoding,
        )
        return tf.summary.tensor_summary(
            name="audio_summary",
            tensor=tensor,
            collections=collections,
            summary_metadata=summary_metadata,
        )
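# A minimal usage sketch for `op` (illustrative only; the placeholder name,
# the 16000 Hz sample rate, the `/tmp/legacy_audio` logdir, and `my_batch`,
# some `[k, t, c]` NumPy array, are assumptions rather than part of this
# module):
#
#   import tensorflow.compat.v1 as tf
#   tf.disable_eager_execution()
#
#   clips = tf.placeholder(tf.float32, shape=[None, None, 1])  # [k, t, c]
#   summary_op = op("waveform", clips, sample_rate=16000, max_outputs=2)
#   with tf.Session() as sess:
#       serialized = sess.run(summary_op, feed_dict={clips: my_batch})
#   tf.summary.FileWriter("/tmp/legacy_audio").add_summary(serialized, 0)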


def pb(
    name,
    audio,
    sample_rate,
    labels=None,
    max_outputs=3,
    encoding=None,
    display_name=None,
    description=None,
):
    """Create a legacy audio summary protobuf.

    This behaves as if you were to create an `op` with the same arguments
    (wrapped with constant tensors where appropriate) and then execute
    that summary op in a TensorFlow session.

    Arguments:
      name: A unique name for the generated summary node.
      audio: An `np.array` representing audio data with shape `[k, t, c]`,
        where `k` is the number of audio clips, `t` is the number of
        frames, and `c` is the number of channels. Elements should be
        floating-point values in `[-1.0, 1.0]`.
      sample_rate: An `int` that represents the sample rate, in Hz.
        Must be positive.
      labels: Deprecated. Do not set.
      max_outputs: Optional `int`. At most this many audio clips will be
        emitted. When more than `max_outputs` many clips are provided, the
        first `max_outputs` many clips will be used and the rest silently
        discarded.
      encoding: A constant `str` indicating the desired encoding. You
        can choose any format you like, as long as it's "wav". Please see
        the "API compatibility note" below.
      display_name: Optional name for this summary in TensorBoard, as a
        `str`. Defaults to `name`.
      description: Optional long-form description for this summary, as a
        `str`. Markdown is supported. Defaults to empty.

    Returns:
      A `tf.Summary` protobuf object.

    API compatibility note: The default value of the `encoding`
    argument is _not_ guaranteed to remain unchanged across TensorBoard
    versions. In the future, we will by default encode as FLAC instead of
    as WAV. If the specific format is important to you, please provide a
    file format explicitly.
    """
    if labels is not None:
        warnings.warn(_LABELS_WARNING)

    # TODO(nickfelt): remove on-demand imports once dep situation is fixed.
    import tensorflow.compat.v1 as tf

    audio = np.array(audio)
    if audio.ndim != 3:
        raise ValueError("Shape %r must have rank 3" % (audio.shape,))
    if encoding is None:
        encoding = "wav"

    if encoding == "wav":
        encoding = metadata.Encoding.Value("WAV")
        encoder = functools.partial(
            encoder_util.encode_wav, samples_per_second=sample_rate
        )
    else:
        raise ValueError("Unknown encoding: %r" % encoding)

    # Keep at most `max_outputs` clips and pair each encoded clip with its
    # (possibly empty) label, forming the `[k, 2]` string tensor described
    # in the module docstring.
    limited_audio = audio[:max_outputs]
    if labels is None:
        limited_labels = [b""] * len(limited_audio)
    else:
        limited_labels = [
            tf.compat.as_bytes(label) for label in labels[:max_outputs]
        ]

    encoded_audio = [encoder(a) for a in limited_audio]
    content = np.array([encoded_audio, limited_labels]).transpose()
    tensor = tf.make_tensor_proto(content, dtype=tf.string)

    if display_name is None:
        display_name = name
    summary_metadata = metadata.create_summary_metadata(
        display_name=display_name, description=description, encoding=encoding
    )
    # Round-trip through serialization so the metadata uses TensorFlow's
    # `SummaryMetadata` proto class rather than TensorBoard's own copy.
    tf_summary_metadata = tf.SummaryMetadata.FromString(
        summary_metadata.SerializeToString()
    )

    summary = tf.Summary()
    summary.value.add(
        tag="%s/audio_summary" % name,
        metadata=tf_summary_metadata,
        tensor=tensor,
    )
    return summary
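# A minimal usage sketch for `pb` (illustrative only; the 440 Hz sine clip
# and the `/tmp/legacy_audio` logdir are assumptions rather than part of
# this module):
#
#   import numpy as np
#   import tensorflow.compat.v1 as tf
#
#   t = np.linspace(0.0, 1.0, 16000, dtype=np.float32)
#   clip = np.sin(2.0 * np.pi * 440.0 * t)[np.newaxis, :, np.newaxis]
#   writer = tf.summary.FileWriter("/tmp/legacy_audio")
#   writer.add_summary(pb("tone", clip, sample_rate=16000), global_step=0)
#   writer.close()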