Saturday, May 4, 2019

Filter Cyrillic files from a folder in Python


""" DetectCyrillic searches file contents recursively and filters all the files
having Cyrillic characters in them, preserving the folder structure.
Supported files: Docx Pptx Xlsx txt
Instruction to use:
This script expects user to provide following information through command line argument
1) Source directory where the file will be searched.
2) Target directory where the matched result will be copied.
Note: This script expects that no two files have the same name and extension;
otherwise they will be replaced in the target directory.
Example of how to run the code:
python DetectCyrillic.py sourceFolder targetFolder
"""
import os
import sys
from shutil import copy
import logging
import docx2txt
import fulltext
import re
from colorama import Fore, init
# Set up colorama (imported above) before any coloured text is printed.
init()
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Route log records of level INFO and above to stderr.
logging.basicConfig(stream=sys.stderr, level=logging.INFO)
# Compiled once at import time. Covers the Cyrillic, Cyrillic Supplement,
# Cyrillic Extended-C, Extended-A and Extended-B Unicode blocks.
_CYRILLIC_RE = re.compile(
    r"[\u0400-\u052F\u1C80-\u1C8F\u2DE0-\u2DFF\uA640-\uA69F]"
)


def has_cyrillic(text):
    """Return True if *text* contains at least one Cyrillic character.

    Args:
        text: The string to inspect. ``None`` and the empty string are
            treated as containing no Cyrillic characters.

    Returns:
        bool: True when a Cyrillic code point is found, False otherwise.
    """
    if not text:
        return False
    return _CYRILLIC_RE.search(text) is not None
def copyErrorFiles(file, dest):
    """Copy a file that could not be parsed into the ``Error`` folder.

    Args:
        file: Path of the file to copy.
        dest: Directory that holds the ``Error`` folder. Both ``dest`` and
            ``dest/Error`` are created when they do not exist yet.
    """
    error_dir = os.path.join(dest, "Error")
    # exist_ok creates any missing parents and removes the gap between
    # checking for the folder and creating it.
    os.makedirs(error_dir, exist_ok=True)
    copy(file, error_dir)
def processTxtfile(file, dest):
    """Report whether a text file contains a Cyrillic character.

    The file is read line by line as UTF-8 and scanning stops at the first
    line that matches.

    Args:
        file: Path of the text file to scan.
        dest: Directory under which the ``Error`` folder is kept; the file is
            copied there when it cannot be read.

    Returns:
        bool: True when a Cyrillic character is found; False when none is
        found or the file could not be processed.
    """
    try:
        # errors="ignore" drops bytes that are not valid UTF-8, so Cyrillic
        # text stored in a different encoding may go undetected.
        with open(file, encoding="utf-8", errors="ignore") as f:
            return any(has_cyrillic(line) for line in f)
    except Exception:
        # Best effort: report the failure, set the file aside, keep going.
        print(Fore.RED + "cannot process txt file %s" % file)
        copyErrorFiles(file, dest)
        return False
def processDocxfile(file, dest):
    """Report whether a docx file contains a Cyrillic character.

    Args:
        file: Path of the docx file; its text is extracted with
            ``docx2txt.process``.
        dest: Directory under which the ``Error`` folder is kept; the file is
            copied there when extraction fails.

    Returns:
        bool: True when the extracted text contains a Cyrillic character;
        False when it does not or the file could not be processed.
    """
    try:
        return has_cyrillic(docx2txt.process(file))
    except Exception:
        # Best effort: report the failure, set the file aside, keep going.
        print(Fore.RED + "cannot process docx file %s" % file)
        copyErrorFiles(file, dest)
        return False
def processPptxfile(file, dest):
    """Report whether a pptx file contains a Cyrillic character.

    Args:
        file: Path of the pptx file; its text is extracted with
            ``fulltext.get``.
        dest: Directory under which the ``Error`` folder is kept; the file is
            copied there when extraction fails.

    Returns:
        bool: True when the extracted text contains a Cyrillic character;
        False when it does not or the file could not be processed.
    """
    try:
        # NOTE(review): confirm the installed fulltext version accepts the
        # ``errors`` keyword; its documented name appears to be
        # ``encoding_errors``. A rejected keyword raises TypeError, which the
        # handler below would turn into an Error-folder copy for every file.
        text = fulltext.get(file, encoding="utf-8", errors='ignore')
        return has_cyrillic(text)
    except Exception:
        # Best effort: report the failure, set the file aside, keep going.
        print(Fore.RED + "cannot process pptx file %s" % file)
        copyErrorFiles(file, dest)
        return False
def processExcelfile(file, dest):
    """Report whether an xlsx file contains a Cyrillic character.

    Args:
        file: Path of the xlsx file; its text is extracted with
            ``fulltext.get``.
        dest: Directory under which the ``Error`` folder is kept; the file is
            copied there when extraction fails.

    Returns:
        bool: True when the extracted text contains a Cyrillic character;
        False when it does not or the file could not be processed.
    """
    try:
        # NOTE(review): confirm the installed fulltext version accepts the
        # ``errors`` keyword; its documented name appears to be
        # ``encoding_errors``. A rejected keyword raises TypeError, which the
        # handler below would turn into an Error-folder copy for every file.
        text = fulltext.get(file, encoding="utf-8", errors='ignore')
        return has_cyrillic(text)
    except Exception:
        # Best effort: report the failure, set the file aside, keep going.
        print(Fore.RED + "cannot process file %s" % file)
        copyErrorFiles(file, dest)
        return False
def detectCyrillic(file, dest):
    """Dispatch a file to the matching parser and report Cyrillic content.

    Supported extensions (case-insensitive): txt, docx, pptx, xlsx. Any other
    file is reported as invalid and copied to the ``Error`` folder.

    Args:
        file: Path of the file to inspect.
        dest: Directory under which the ``Error`` folder is kept.

    Returns:
        bool: True when the file contains a Cyrillic character; False when it
        does not, has an unsupported extension, or could not be parsed.
    """
    handlers = {
        "txt": processTxtfile,
        "docx": processDocxfile,
        "pptx": processPptxfile,
        "xlsx": processExcelfile,
    }
    # splitext only looks at the final path component, so a dot in a parent
    # directory name is not mistaken for the start of an extension.
    ext = os.path.splitext(file)[1][1:].lower()
    handler = handlers.get(ext)
    if handler is None:
        print(Fore.RED + "Invalid Extension %s " % (ext))
        copyErrorFiles(file, dest)
        return False
    return bool(handler(file, dest))
def findAll(src, dest):
    """Copy every file under *src* that contains Cyrillic text into *dest*.

    The directory layout below ``src`` is reproduced below ``dest``. Target
    directories are created only when a matching file has to be copied.

    Args:
        src: Root directory that is searched recursively.
        dest: Root directory that receives the matching files.
    """
    dest_abs = os.path.abspath(dest)
    for root, dirs, files in os.walk(src):
        # When the target lies inside the source tree, do not descend into
        # it; otherwise files copied earlier would be scanned again.
        dirs[:] = [
            d for d in dirs
            if os.path.abspath(os.path.join(root, d)) != dest_abs
        ]
        # Map the current directory onto the target tree by relative path.
        # A plain str.replace would rewrite every occurrence of ``src`` in
        # the path, not just the leading one.
        rel = os.path.relpath(root, src)
        rootdest = dest if rel == os.curdir else os.path.join(dest, rel)
        for x in files:
            print(Fore.WHITE + "Start : Detect Cryllic %s " % (x))
            srcfilePath = os.path.join(root, x)
            if detectCyrillic(srcfilePath, rootdest):
                os.makedirs(rootdest, exist_ok=True)
                copy(srcfilePath, rootdest)
            print(Fore.WHITE + "End : %s " % (x))
if __name__ == "__main__":
    # Both folders are required; exit with a usage hint instead of an
    # IndexError traceback when they are missing.
    if len(sys.argv) < 3:
        sys.exit("Usage: python DetectCyrillic.py sourceFolder targetFolder")
    findAll(sys.argv[1], sys.argv[2])