I need to extract text from PDF files stored in Azure Blob Storage. I can list the blob names, but I can't read their contents:
import os
import uuid
import sys
from azure.storage.blob import BlockBlobService, PublicAccess

# Lists every blob in a container and downloads each one to a local directory,
# using the legacy (azure-storage-blob <= 2.1) BlockBlobService API throughout.
# The downloaded files are the raw PDF bytes; turning them into text requires a
# PDF parsing library on top of this script.

# Read the credentials from the environment instead of embedding the account
# key in source. A missing key fails fast with a KeyError.
account_name = os.environ.get('AZURE_STORAGE_ACCOUNT_NAME', 'servicestorageblob')
account_key = os.environ['AZURE_STORAGE_ACCOUNT_KEY']

# Create the BlockBlobService that the script uses to call the Blob service
# for the storage account.
block_blob_service = BlockBlobService(
    account_name=account_name,
    account_key=account_key)

# Make sure the 'dataset' container exists.
container_name = 'dataset'
block_blob_service.create_container(container_name)

# Set the permission so the blobs are public.
# NOTE(review): this makes every blob in the container anonymously readable.
# Remove this call if the PDFs are not meant to be public.
block_blob_service.set_container_acl(
    container_name, public_access=PublicAccess.Container)

# Directory that receives the downloaded blobs.
local_path = os.path.join(os.getcwd(), 'downloaded_blobs')

# List the blobs in the container and download each one.
print("\nList blobs in the container")
generator = block_blob_service.list_blobs(container_name)
for blob in generator:
    print("\t Blob name: " + blob.name)

    # Blob names may contain '/' as a virtual directory separator; mirror that
    # layout locally so two blobs with the same base name cannot collide.
    download_file_path = os.path.join(local_path, *blob.name.split('/'))
    os.makedirs(os.path.dirname(download_file_path), exist_ok=True)

    print("\nDownloading blob to \n\t" + download_file_path)
    # get_blob_to_path is the legacy-SDK download call; the v12-style
    # blob_client.download_blob().readall() does not exist on BlockBlobService.
    block_blob_service.get_blob_to_path(
        container_name, blob.name, download_file_path)
I tried
var
This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)