mirror of
https://github.com/snakers4/silero-vad.git
synced 2026-02-05 18:09:22 +08:00
Merge pull request #626 from b3by/feature/process_chunks_in_seconds
Use second coordinates for audio concatenation in collect_chunks and drop_chunks
This commit is contained in:
@@ -490,18 +490,104 @@ class VADIterator:
|
|||||||
|
|
||||||
|
|
||||||
def collect_chunks(tss: List[dict],
                   wav: torch.Tensor,
                   seconds: bool = False,
                   sampling_rate: int = None) -> torch.Tensor:
    """Collect audio chunks from a longer audio clip.

    This method extracts audio chunks from an audio clip, using a list of
    provided coordinates, and concatenates them together. Coordinates can be
    passed either as sample numbers or in seconds, in which case the audio
    sampling rate is also needed.

    Parameters
    ----------
    tss: List[dict]
        Coordinate list of the clips to collect from the audio; each dict
        carries 'start' and 'end' keys.
    wav: torch.Tensor, one dimensional
        One dimensional float torch.Tensor, containing the audio to clip.
    seconds: bool (default - False)
        Whether input coordinates are passed as seconds or samples.
    sampling_rate: int (default - None)
        Input audio sampling rate. Required if seconds is True.

    Returns
    -------
    torch.Tensor, one dimensional
        One dimensional float torch.Tensor of the concatenated clipped audio
        chunks. An empty tensor if ``tss`` is empty.

    Raises
    ------
    ValueError
        Raised if sampling_rate is not provided when seconds is True.
    """
    if seconds and not sampling_rate:
        raise ValueError('sampling_rate must be provided when seconds is True')

    _tss = _seconds_to_samples_tss(tss, sampling_rate) if seconds else tss

    chunks = [wav[i['start']:i['end']] for i in _tss]
    # torch.cat raises RuntimeError on an empty list; an empty coordinate
    # list should simply yield an empty clip.
    return torch.cat(chunks) if chunks else torch.empty(0, dtype=wav.dtype)
||||||
def drop_chunks(tss: List[dict],
                wav: torch.Tensor,
                seconds: bool = False,
                sampling_rate: int = None) -> torch.Tensor:
    """Drop audio chunks from a longer audio clip.

    This method removes the spans listed in ``tss`` from ``wav`` and
    concatenates the remaining audio. Coordinates can be passed either as
    sample numbers or in seconds, in which case the audio sampling rate is
    also needed.

    Parameters
    ----------
    tss: List[dict]
        Coordinate list of the clips to drop from the audio; each dict
        carries 'start' and 'end' keys.
    wav: torch.Tensor, one dimensional
        One dimensional float torch.Tensor, containing the audio to clip.
    seconds: bool (default - False)
        Whether input coordinates are passed as seconds or samples.
    sampling_rate: int (default - None)
        Input audio sampling rate. Required if seconds is True.

    Returns
    -------
    torch.Tensor, one dimensional
        One dimensional float torch.Tensor of the input audio minus the
        dropped chunks. The whole input if ``tss`` is empty.

    Raises
    ------
    ValueError
        Raised if sampling_rate is not provided when seconds is True.
    """
    if seconds and not sampling_rate:
        raise ValueError('sampling_rate must be provided when seconds is True')

    _tss = _seconds_to_samples_tss(tss, sampling_rate) if seconds else tss

    chunks = []
    cur_start = 0
    for i in _tss:
        chunks.append(wav[cur_start:i['start']])
        cur_start = i['end']
    # Keep the audio after the final dropped span. The previous version
    # never appended this tail, silently discarding everything past the
    # last chunk's 'end' (and crashing on torch.cat([]) for empty tss).
    chunks.append(wav[cur_start:])
    return torch.cat(chunks)
|
def _seconds_to_samples_tss(tss: List[dict], sampling_rate: int) -> List[dict]:
|
||||||
|
"""Convert coordinates expressed in seconds to sample coordinates.
|
||||||
|
"""
|
||||||
|
return [{
|
||||||
|
'start': round(crd['start']) * sampling_rate,
|
||||||
|
'end': round(crd['end']) * sampling_rate
|
||||||
|
} for crd in tss]
|
||||||
|
|||||||
Reference in New Issue
Block a user