# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
"""DetectCyrillic searches the content of files recursively and selects all the files
that have a Cyrillic character in them, preserving the folder structure.

Supported files: docx, pptx, xlsx, txt

Instructions for use:
This script expects the user to provide the following information through command-line arguments:
1) Source directory in which the files will be searched.
2) Target directory to which the matching files will be copied.

Note: This script expects that no two files have the same name and extension;
otherwise they will be replaced in the target directory.

Example of running the code:
python DetectCyrillic.py sourceFolder targetFolder
"""
import os | |
import sys | |
from shutil import copy | |
import logging | |
import docx2txt | |
import fulltext | |
import re | |
from colorama import Fore, init | |
# Initialise colorama before anything is printed and before the logging
# handler below is handed sys.stderr; keep init() first.
init()

# Send log records of INFO level and above to standard error.
logging.basicConfig(stream=sys.stderr, level=logging.INFO)
logger = logging.getLogger(__name__)
# Matches any code point in the Unicode blocks allocated to the Cyrillic
# script. Compiled once at import time because it is applied to every line
# of every scanned document.
_CYRILLIC_RE = re.compile(
    "["
    "\u0400-\u04FF"  # Cyrillic
    "\u0500-\u052F"  # Cyrillic Supplement
    "\u1C80-\u1C8F"  # Cyrillic Extended-C
    "\u2DE0-\u2DFF"  # Cyrillic Extended-A
    "\uA640-\uA69F"  # Cyrillic Extended-B
    "\U0001E030-\U0001E08F"  # Cyrillic Extended-D
    "]"
)


def has_cyrillic(text):
    """Return True if *text* contains at least one Cyrillic character.

    Args:
        text: The string to inspect. ``None`` and the empty string are
            treated as containing no Cyrillic characters.

    Returns:
        bool: True when any character of *text* lies in one of the Unicode
        Cyrillic blocks, False otherwise.
    """
    if not text:
        # Nothing to scan (also guards against a None result from a text extractor).
        return False
    return _CYRILLIC_RE.search(text) is not None
def copyErrorFiles(file, dest):
    """Copy *file* into the ``Error`` sub-folder of *dest*.

    Used for documents that could not be parsed or have an unsupported
    extension. Both *dest* and ``dest/Error`` are created when missing.

    Args:
        file: Path of the file to copy.
        dest: Directory under which the ``Error`` folder lives.
    """
    error_dir = os.path.join(dest, "Error")
    # exist_ok=True creates dest and dest/Error in one step and does not fail
    # when they already exist, so no separate listdir/isdir check is needed.
    os.makedirs(error_dir, exist_ok=True)
    copy(file, error_dir)
def processTxtfile(file, dest):
    """Report whether the text file *file* contains a Cyrillic character.

    The file is decoded as UTF-8 with undecodable bytes dropped and is
    scanned line by line, stopping at the first line that matches.

    Args:
        file: Path of the text file to scan.
        dest: Directory whose ``Error`` sub-folder receives *file* when it
            cannot be processed.

    Returns:
        bool: True if a Cyrillic character was found; False if none was found
        or the file could not be processed.
    """
    try:
        with open(file, encoding="utf-8", errors="ignore") as f:
            # any() short-circuits on the first matching line.
            return any(has_cyrillic(line) for line in f)
    except Exception as exc:
        # Best effort: record the reason, set the file aside and keep going.
        logger.warning("cannot process txt file %s: %s", file, exc)
        print(Fore.RED + "cannot process txt file %s" % file)
        copyErrorFiles(file, dest)
        return False
def processDocxfile(file, dest):
    """Report whether the docx file *file* contains a Cyrillic character.

    The document text is obtained with ``docx2txt.process`` and checked with
    ``has_cyrillic``.

    Args:
        file: Path of the docx file to scan.
        dest: Directory whose ``Error`` sub-folder receives *file* when it
            cannot be processed.

    Returns:
        bool: True if a Cyrillic character was found; False if none was found
        or the file could not be processed.
    """
    try:
        return bool(has_cyrillic(docx2txt.process(file)))
    except Exception as exc:
        # Best effort: record the reason, set the file aside and keep going.
        logger.warning("cannot process docx file %s: %s", file, exc)
        print(Fore.RED + "cannot process docx file %s" % file)
        copyErrorFiles(file, dest)
        return False
def processPptxfile(file, dest):
    """Report whether the pptx file *file* contains a Cyrillic character.

    The document text is obtained with ``fulltext.get`` and checked with
    ``has_cyrillic``.

    Args:
        file: Path of the pptx file to scan.
        dest: Directory whose ``Error`` sub-folder receives *file* when it
            cannot be processed.

    Returns:
        bool: True if a Cyrillic character was found; False if none was found
        or the file could not be processed.
    """
    try:
        # NOTE(review): verify that the installed fulltext release accepts the
        # `errors` keyword; if it does not, every call lands in the except
        # branch below - TODO confirm.
        doc = fulltext.get(file, encoding="utf-8", errors='ignore')
        return bool(has_cyrillic(doc))
    except Exception as exc:
        # Best effort: record the reason, set the file aside and keep going.
        logger.warning("cannot process pptx file %s: %s", file, exc)
        print(Fore.RED + "cannot process pptx file %s" % file)
        copyErrorFiles(file, dest)
        return False
def processExcelfile(file, dest):
    """Report whether the xlsx file *file* contains a Cyrillic character.

    The workbook text is obtained with ``fulltext.get`` and checked with
    ``has_cyrillic``.

    Args:
        file: Path of the xlsx file to scan.
        dest: Directory whose ``Error`` sub-folder receives *file* when it
            cannot be processed.

    Returns:
        bool: True if a Cyrillic character was found; False if none was found
        or the file could not be processed.
    """
    try:
        # NOTE(review): verify that the installed fulltext release accepts the
        # `errors` keyword; if it does not, every call lands in the except
        # branch below - TODO confirm.
        doc = fulltext.get(file, encoding="utf-8", errors='ignore')
        return bool(has_cyrillic(doc))
    except Exception as exc:
        # Best effort: record the reason, set the file aside and keep going.
        logger.warning("cannot process file %s: %s", file, exc)
        print(Fore.RED + "cannot process file %s" % file)
        copyErrorFiles(file, dest)
        return False
def detectCyrillic(file, dest):
    """Dispatch *file* to the scanner matching its extension.

    Supported extensions (case-insensitive) are txt, docx, pptx and xlsx.
    A file with any other extension is reported, copied to the ``Error``
    sub-folder of *dest* and treated as not containing Cyrillic text.

    Args:
        file: Path of the file to inspect.
        dest: Directory passed on to the scanners for their error copies.

    Returns:
        bool: True if the file contains a Cyrillic character, False otherwise.
    """
    handlers = {
        "txt": processTxtfile,
        "docx": processDocxfile,
        "pptx": processPptxfile,
        "xlsx": processExcelfile,
    }

    # Take the extension from the file name only, so that a dot in a parent
    # directory is never mistaken for the start of the extension.
    _, dot, suffix = os.path.basename(file).rpartition(".")
    ext = suffix.lower() if dot else ""

    handler = handlers.get(ext)
    if handler is None:
        print(Fore.RED + "Invalid Extension %s " % (ext))
        copyErrorFiles(file, dest)
        return False
    # Normalise to a bool whatever the scanner returned.
    return bool(handler(file, dest))
def findAll(src, dest):
    """Walk *src* and copy every file containing Cyrillic text into *dest*.

    The directory layout below *src* is reproduced below *dest*. Files that
    cannot be scanned are handled by ``detectCyrillic`` and its helpers.

    Args:
        src: Root directory to search.
        dest: Root directory that receives the matching files.
    """
    for root, _dirs, files in os.walk(src):
        # Map this directory onto dest through its path relative to src.
        # A plain str.replace(src, dest) would rewrite every occurrence of
        # src in the path (e.g. src="." or a repeated folder name).
        rootdest = os.path.normpath(
            os.path.join(dest, os.path.relpath(root, src))
        )
        for x in files:
            print(Fore.WHITE + "Start : Detect Cryllic %s " % (x))
            srcfilePath = os.path.join(root, x)
            if detectCyrillic(srcfilePath, rootdest):
                # Create the mirrored folder only when there is a file to put in it.
                os.makedirs(rootdest, exist_ok=True)
                copy(srcfilePath, rootdest)
            print(Fore.WHITE + "End : %s " % (x))
if __name__ == "__main__":
    # Both folders are required; fail with a usage message instead of an IndexError.
    if len(sys.argv) < 3:
        sys.exit("Usage: python DetectCyrillic.py sourceFolder targetFolder")
    # os.walk silently yields nothing for a missing directory, so check up front.
    if not os.path.isdir(sys.argv[1]):
        sys.exit("Source directory not found: %s" % sys.argv[1])
    findAll(sys.argv[1], sys.argv[2])