# lizhigong-VDC / VoiceToImg.py

'''
第一步 把wav数据集放在DataSetPath文件夹下 并编号 从0001开始
第二步 运行ChangeWavToImg将wav文件转换成语谱图
'''
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import cv2
import random

# Root directory of the dataset; each entry below it is treated as one class.
DataSetPath: str = "DataSet"
# Per-class image count after which generation for that class stops.
maxClassImgCount: int = 600

def ExportImg(groupNp, NFFT, framerate, framesize, overlapSize, exportFile, index):
    """Render the spectrogram of a waveform segment and save it as a PNG.

    The spectrogram is drawn without axes or margins on a private figure,
    rasterised in memory, resized so that one pixel corresponds to one
    spectrogram cell, and written to ``exportFile + str(index) + '.png'``.

    Args:
        groupNp: 1-D sequence of audio samples.
        NFFT: number of points per FFT segment.
        framerate: sampling rate of ``groupNp`` (passed as ``Fs``).
        framesize: length of the Hann window; the caller in this file always
            passes the same value as ``NFFT``.
        overlapSize: number of samples shared by consecutive segments.
        exportFile: path prefix of the output image.
        index: image number appended to ``exportFile``.
    """
    import io  # function-scope import: io is not among the file-level imports

    # A dedicated figure per call keeps successive spectrograms from being
    # drawn on top of each other; it is always closed so that generating
    # hundreds of images does not accumulate open figures.
    fig = plt.figure()
    try:
        axes = fig.add_subplot(111)
        spectrum, _freqs, _times, _image = axes.specgram(
            groupNp,
            NFFT=NFFT,
            Fs=framerate,
            window=np.hanning(M=framesize),
            noverlap=overlapSize,
            mode='default',
            scale_by_freq=True,
            sides='onesided',
            scale='dB',
            xextent=None)

        # Strip ticks, margins and the frame so only the spectrogram remains.
        axes.xaxis.set_major_locator(plt.NullLocator())
        axes.yaxis.set_major_locator(plt.NullLocator())
        fig.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0)
        axes.margins(0, 0)
        axes.axis('off')

        # Rasterise to memory instead of a shared "tmp.png" in the working
        # directory; no plt.show(), which would block batch generation.
        pngBuffer = io.BytesIO()
        fig.savefig(pngBuffer, format='png')
    finally:
        plt.close(fig)

    src = cv2.imdecode(np.frombuffer(pngBuffer.getvalue(), dtype=np.uint8),
                       cv2.IMREAD_COLOR)
    # spectrum has one row per frequency bin and one column per time segment;
    # cv2.resize expects the target size as (width, height).
    binCount, segmentCount = np.shape(spectrum)
    imageChange = cv2.resize(src, (segmentCount, binCount))
    # imencode + tofile (rather than cv2.imwrite) is kept from the original;
    # presumably it is there to cope with non-ASCII output paths.
    cv2.imencode('.png', imageChange)[1].tofile(exportFile + str(index) + '.png')

def _NearestFrameSize(targetSize, candidates=(32, 64, 128, 256, 512, 1024)):
    """Return the candidate closest to targetSize (the smaller one on a tie)."""
    return min(candidates, key=lambda size: abs(targetSize - size))


def _GroupActivityRates(samples, framesize, framesPerGroup=10, threshold=0.1):
    """Return, per group of frames, the fraction of frames that count as loud.

    A frame is loud when its maximum sample, normalised by the largest frame
    maximum of the whole segment, exceeds ``threshold``. The last complete
    group is not evaluated (kept from the original implementation). An empty
    array is returned when the audio is too short to form any group.
    """
    frameCount = len(samples) // framesize
    groupCount = frameCount // framesPerGroup - 1
    if groupCount <= 0:
        return np.zeros(0)
    frames = samples[:frameCount * framesize].reshape(frameCount, framesize)
    frameMax = frames.max(axis=1)
    peak = frameMax.max()
    if peak == 0:
        # Nothing to normalise by: treat every frame as quiet instead of
        # dividing by zero.
        return np.zeros(groupCount)
    loud = (frameMax / peak)[:groupCount * framesPerGroup] > threshold
    return loud.reshape(groupCount, framesPerGroup).sum(axis=1) / float(framesPerGroup)


def _ExportMatchingWindows(samples, rates, windowGroups, groupSize, accept,
                           cooldown, export, exportFile, index):
    """Export every accepted window of ``windowGroups`` consecutive groups.

    After a hit, following start positions are skipped while the cooldown
    counter is still positive. Returns the next free image index.
    """
    skip = 0
    for start in range(len(rates) - windowGroups):
        if skip > 0:
            skip -= 1
            continue
        if accept(rates[start:start + windowGroups]):
            skip = cooldown
            export(samples[groupSize * start:groupSize * (start + windowGroups)],
                   exportFile, index)
            index += 1
    return index


def WavToImg(nplist, exportFile, index, backFile, backIndex, framelength = 0.025, framerate = 11025):
    """Cut a waveform into voiced and background segments and export spectrograms.

    The audio is split into frames; frames are bundled into groups of 10 and
    each group gets an activity rate (share of frames whose peak exceeds 10%
    of the overall peak). A window of 6 consecutive groups (15360 samples,
    about 1.4 s, at the default settings) is exported as

    * a voice image when every group in it has a rate of at least 0.3,
    * a background image when every group in it has a rate of at most 0.1.

    The scan is repeated on copies of the audio shifted by multiples of 64
    samples (``framesize // 64`` copies), which yields additional images.
    Audio too short to form a window produces no images instead of raising.

    Args:
        nplist: 1-D sequence of audio samples.
        exportFile: path prefix for voice images.
        index: first free voice image number.
        backFile: path prefix for background images.
        backIndex: first free background image number.
        framelength: frame duration in seconds.
        framerate: sampling rate of ``nplist``.

    Returns:
        Tuple ``(index, backIndex)`` with the next free image numbers.
    """
    # Samples per frame is t * fs, snapped to the nearest supported power of
    # two; NFFT equals the frame size, i.e. the FFT is not zero-padded.
    framesize = int(_NearestFrameSize(framelength * framerate))
    NFFT = framesize
    overlapSize = int(round(1.0 / 2 * framesize))  # half-frame overlap
    print("帧长为{},帧叠为{},傅里叶变换点数为{}".format(framesize,overlapSize,NFFT))

    samples = np.asarray(nplist)
    groupFrameCount = 6   # groups per exported window
    framesPerGroup = 10   # frames per group
    groupSize = framesize * framesPerGroup

    def export(segment, path, number):
        ExportImg(segment, NFFT, framerate, framesize, overlapSize, path, number)

    for shift in range(framesize // 64):
        shifted = samples[shift * 64:]
        rates = _GroupActivityRates(shifted, framesize, framesPerGroup)

        # Voice: all groups active. The fractional cooldown (4.8) is kept from
        # the original and skips the next 5 start positions.
        index = _ExportMatchingWindows(
            shifted, rates, groupFrameCount, groupSize,
            lambda window: bool(np.all(window >= 0.3)),
            groupFrameCount * 0.8, export, exportFile, index)

        # Background: all groups quiet; the next 6 start positions are skipped.
        backIndex = _ExportMatchingWindows(
            shifted, rates, groupFrameCount, groupSize,
            lambda window: bool(np.all(window <= 0.1)),
            groupFrameCount, export, backFile, backIndex)
    return index, backIndex

def ChangeWavToImg():
    """Build the spectrogram dataset from the wav files under DataSetPath.

    Every sub-directory of DataSetPath is treated as one class. For each
    class an ``img`` sub-directory is created if missing and emptied of
    existing png files; then every wav file of the class is loaded at
    11025 Hz and converted by WavToImg. Voice images go to
    ``<class>/img/<wav name><n>.png``; background images of all classes are
    collected in ``0000/img/<n>.png``. Processing of a class stops once its
    image count exceeds maxClassImgCount.
    """
    sampleRate = 11025

    backDir = os.path.join(DataSetPath, '0000', 'img')
    if not os.path.isdir(backDir):
        os.makedirs(backDir)
    # ExportImg appends "<n>.png" directly to this prefix, so it has to end
    # with a path separator.
    backPath = os.path.join(backDir, '')

    # Only directories are classes; stray files in the dataset root are
    # skipped instead of crashing makedirs/listdir.
    classDirs = []
    for entry in os.listdir(DataSetPath):
        classDir = os.path.join(DataSetPath, entry)
        if os.path.isdir(classDir):
            classDirs.append(classDir)

    # Clean up: remove images left over from a previous run.
    for classDir in classDirs:
        imgDir = os.path.join(classDir, 'img')
        if not os.path.isdir(imgDir):
            os.makedirs(imgDir)
        for imgName in os.listdir(imgDir):
            if imgName.endswith('png'):
                os.remove(os.path.join(imgDir, imgName))

    backIndex = 0
    for classDir in classDirs:
        imageIndex = 0
        for wavName in os.listdir(classDir):
            if wavName.endswith('wav'):
                # librosa resamples the file to sampleRate while loading.
                y, _ = librosa.load(os.path.join(classDir, wavName), sr=sampleRate)
                imageIndex, backIndex = WavToImg(
                    y, os.path.join(classDir, 'img', wavName),
                    imageIndex, backPath, backIndex, framerate=sampleRate)

            if imageIndex > maxClassImgCount:
                break


def GetBatchData(batchSize = 10, centerSize = -1, Test = False):
    """Return a random batch of spectrogram images with their class labels.

    Intended to be called by the neural network code. Classes are the
    sub-directories of DataSetPath that contain an ``img`` directory, in
    sorted order; the label of a sample is the position of its class in that
    list. Within a class the sorted image list is split 70/30: training
    batches draw from the first 70%, test batches from the remaining 30%.

    Args:
        batchSize: number of samples to return.
        centerSize: the first ``centerSize`` samples all come from one
            randomly chosen class; values <= 0 disable this.
        Test: draw from the held-out 30% instead of the training 70%.

    Returns:
        Tuple ``(labels, datas)``: ``labels`` is a list of ``[classIndex]``
        and ``datas`` the matching list of images decoded by OpenCV.

    Raises:
        ValueError: if there is no class directory, or a selected class has
            no image in the requested split.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # label <-> class mapping and the train/test split stable across calls.
    labelPaths = sorted(
        name for name in os.listdir(DataSetPath)
        if os.path.isdir(os.path.join(DataSetPath, name, 'img')))
    if not labelPaths:
        raise ValueError("no class directories with images under " + DataSetPath)

    labels = []
    datas = []
    centerIndex = random.randint(0, len(labelPaths) - 1)
    for i in range(batchSize):
        if i < centerSize:
            pathIndex = centerIndex
        else:
            pathIndex = random.randint(0, len(labelPaths) - 1)
        imgDir = os.path.join(DataSetPath, labelPaths[pathIndex], 'img')
        files = sorted(os.listdir(imgDir))

        # Training samples: [0, split - 1]; test samples: [split, len - 1].
        # The original test branch drew from the whole list, so test batches
        # overlapped the training portion.
        split = int(len(files) * 0.7)
        if Test:
            low, high = split, len(files) - 1
        else:
            low, high = 0, split - 1
        if low > high:
            raise ValueError("not enough images in " + imgDir)
        fileIndex = random.randint(low, high)

        # Read through numpy + imdecode, mirroring the imencode + tofile
        # writer in ExportImg.
        imgPath = os.path.join(imgDir, files[fileIndex])
        img = cv2.imdecode(np.fromfile(imgPath, dtype=np.uint8), cv2.IMREAD_COLOR)
        labels.append([pathIndex])
        datas.append(img)
    return labels, datas

#if __name__ == '__main__':
#    ChangeWavToImg()