id3-unicodify.py : id3-unicodify-1.0.0.py

Download Python Script
#!/bin/python3
'''
    id3-unicodify fixes non-ASCII ID3 tags that were wrongly stored as Latin-1
    by misconfigured software. For usage information run with -h/--help.

    Copyright (C) 2018 dkr <https://tarxjf.info> <dkr _at_ tarxjf.info>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
'''

import argparse
import os
from mutagen.easyid3 import EasyID3
from mutagen import MutagenError
from bs4 import UnicodeDammit


def tagToString(tag_value):
    """Collapse a tag's value list into one string and encode it.

    The pieces of ``tag_value`` are concatenated without a separator and
    encoded with the module-level ``source_enc``. The result is ``bytes``
    when the encoding succeeds; if the text contains characters that
    ``source_enc`` cannot represent, the joined ``str`` is returned as is.
    """
    joined = ''.join(tag_value)
    try:
        return joined.encode(source_enc)
    except UnicodeEncodeError:
        # Text that does not fit into source_enc is assumed to be proper
        # Unicode already, so it is passed along un-encoded.
        return joined

def valueReplace(f):
    """Rewrite every tag of ``f`` whose text looks mis-encoded.

    Each value is run through ``tagToString`` and ``unicodify``. When the
    detected encoding is missing (``None``) or one of ascii / iso-8859-1 /
    latin-1, the tag is left alone. Otherwise the change is announced on
    stdout and the tag is replaced, in memory only, with the decoded text.
    Saving is left to the caller.
    """
    # Detection results for which the stored value is kept as it is.
    untouched = (None, "ascii", "iso-8859-1", "latin-1")
    for key, value in f.items():
        detected = unicodify(tagToString(value))
        encoding = detected['source_enc']
        if encoding in untouched:
            continue
        replacement = detected['uni_markup']
        print('{} "{}" looks like {}. It will become "{}"'.format(
            key, ''.join(value), encoding, replacement))
        f[key] = replacement

def openFile(filename):
    """Open ``filename`` with EasyID3 and return the resulting tag object.

    A confirmation line is printed once EasyID3 has accepted the file;
    any exception raised by EasyID3 propagates to the caller.
    """
    tags = EasyID3(filename)
    print("Loaded file: {}.".format(filename))
    return tags

def travDir(workingdir):
    """Return the paths of all files found below ``workingdir``.

    The tree is walked recursively with ``os.walk``; every file name is
    joined onto the directory it was found in. Directories themselves are
    not listed, so a tree without files yields an empty list.
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(workingdir)
        for name in names
    ]

def unicodify(string):
    """Guess the encoding of ``string`` and return it converted to Unicode.

    The module-level ``suspects`` value (the -c/--codepage option) is handed
    to UnicodeDammit as the encodings to try first. The option is documented
    as a comma-separated list, so a string is split on commas and stripped
    of surrounding whitespace; previously the whole string was passed as a
    single (invalid) encoding name. A missing option results in no hints.

    Args:
        string: The tag text as produced by ``tagToString`` (bytes, or str
            when the text could not be encoded).

    Returns:
        dict: ``'source_enc'`` holds UnicodeDammit's ``original_encoding``
        (may be ``None``, which ``valueReplace`` treats as "nothing to do")
        and ``'uni_markup'`` holds its ``unicode_markup``.
    """
    if isinstance(suspects, str):
        # "cp1251, koi8-r" -> ["cp1251", "koi8-r"]; empty items are dropped.
        candidates = [enc.strip() for enc in suspects.split(',')
                      if enc.strip()]
    else:
        # None (option not given) or an already prepared sequence.
        candidates = list(suspects or [])
    language = UnicodeDammit(string, candidates)
    return {'source_enc': language.original_encoding,
            'uni_markup': language.unicode_markup}


# Command-line interface: exactly one of -d/--dir or -f/--file is required.
parser = argparse.ArgumentParser(description='')
source = parser.add_mutually_exclusive_group(required=True)
source.add_argument('-d', '--dir', help='Source directory path')
source.add_argument('-f', '--file', help='Source file')
parser.add_argument('-c', '--codepage',
                    help='Suspected encodings to aid detection. \
                    Comma-seperated list.')
parser.add_argument('--dryrun', help='Do not actually write changes to file',
                    action='store_true')
args = parser.parse_args()

# Module-level settings read by tagToString() and unicodify().
source_enc = 'latin-1' # Default scenario
suspects = args.codepage

print(__doc__)

if args.dryrun:
    print("This is a dry run. No changes will be written.")
else:
    print("Changes will be written.")

# A single file is handled as a one-element list so that both modes share
# the same loop. Previously the -f/--file branch never saved its changes
# and let a MutagenError escape as a traceback.
if args.file is not None:
    filelist = [args.file]
else:
    filelist = travDir(args.dir)

for f in filelist:
    try:
        audiofile = openFile(f)
        valueReplace(audiofile)
        print("================")
        if not args.dryrun:
            # Persist the in-memory tag changes made by valueReplace().
            audiofile.save()
    except MutagenError:
        print("Skipping file {}. No ID3 tags or incompatible format."
              .format(f))
        print("================")
Category: Releases