Google App Script OCR PDF to text Page Number Limitation - pdf

I am very new to Google Script. I have some pdf files in a folder on Google Drive and I am trying to convert pdf to google doc and extract specific text. PDF has more than 200 pages but even the google.doc file is limited to 80 pages. Is there a limit on number of pages you can run OCR on? Or I am missing something....
My code below:
//#####GLOBALS#####
const FOLDER_ID = "1rlAL4WrnxQ6pEY2uOmzWA_csUIDdBjVK"; //Folder ID of all PDFs
const SS = "1XS_YUUdu9FK_bBumK3lFu9fU_M9w7NGydZqOzu9vTyE";//The spreadsheet ID
SHEET = "Extracted";//The sheet tab name
/*########################################################
Main run file: extracts student IDs from PDFs and their
section from the PDF name from multiple documents.
Displays a list of students and sections in a Google Sheet.
*/
function extractInfo(){
const ss = SpreadsheetApp.getActiveSpreadsheet()
//Get all PDF files:
const folder = DriveApp.getFolderById(FOLDER_ID);
//const files = folder.getFiles();
const files = folder.getFilesByType("application/pdf");
let allInfo = []
//Iterate through each folderr
while(files.hasNext()){
Logger.log('first call');
let file = files.next();
let fileID = file.getId();
const doc = getTextFromPDF(fileID);
const invDate = extractInvDate(doc.text);
allInfo = allInfo.concat(invDate);
Logger.log("Length of allInfo array: ")
Logger.log(allInfo.length);
}
importToSpreadsheet(allInfo); //this is 80, even though pdf
//has more than 200 pages with
//required text (invoice date) on each page
};
/*########################################################
* Extracts the text from a PDF and stores it in memory.
* Also extracts the file name.
*
* param {string} : fileID : file ID of the PDF that the text will be extracted from.
*
* returns {array} : Contains the file name and PDF text.
*
*/
function getTextFromPDF(fileID) {
var blob = DriveApp.getFileById(fileID).getBlob()
var resource = {
title: blob.getName(),
mimeType: blob.getContentType()
};
var options = {
ocr: true,
ocrLanguage: "en"
};
// Convert the pdf to a Google Doc with ocr.
var file = Drive.Files.insert(resource, blob, options);
// Get the texts from the newly created text.
var doc = DocumentApp.openById(file.id);
var text = doc.getBody().getText();
var title = doc.getName();
// Deleted the document once the text has been stored.
Drive.Files.remove(doc.getId());
return {
name:title,
text:text
};
}
function extractInvDate(text){
const regexp = /Invoice Date:/g;//commented out \d{2}\/\d{2}\/\d{4}/gi;
try{
let array = [...text.match (regexp)];
return array;
}catch(e){
}
};
function importToSpreadsheet(data){
const sheet = SpreadsheetApp.openById(SS).getSheetByName(SHEET);
const range = sheet.getRange(3,1,data.length,1);
var j = 0;
for (j = 0; j < data.length; j++){
Logger.log(j);
range.getCell(j+1,1).setValue(data[j]);
}
//range.sort([2,1]);
}

The problem or limitation is with the Drive.Files.insert function
When the blob is extracted, fetch the string but it has mime details also...one may need to process it. Sample code is below. modify as per ur need
var blob = DriveApp.getFileById(fileID).getBlob()
var txt = blob.getDataAsString()

Related

Translate an entire page with Google Translate API

I'm using the google Translate API to translate content that's inside an IFRAME. When using the API, I notice that it returns a array(string) with the translated texts, but when inserting it in the DOM, the entire structure disappears, leaving only the text.
The original Page
The result page after the response of the Google API (translate in Portuguese)
This is the code that I'm using:
function translate(lang) {
var iframe = document.getElementById('mainIframe');
var iframeDoc = iframe.contentDocument || iframe.contentWindow.document;
var iframeBody = iframeDoc.body;
var iframeHTML = iframeDoc.documentElement;
var iframeText = iframeBody.innerText || iframeHTML.innerText;
var url = 'https://translate.googleapis.com/translate_a/single?client=gtx&sl=auto&tl=pt&dt=t&q=' + encodeURI(iframeText);
var xhr = new XMLHttpRequest();
xhr.open('GET', url, true);
xhr.onreadystatechange = function() {
if (xhr.readyState == 4) {
var response = JSON.parse(xhr.responseText);
var translatedText = ''
for (var i = 0; i < response[0].length; i++) {
translatedText += response[0][i][0];
}
iframeDoc.body.innerHTML = translatedText;
}
};
xhr.send();
}
How can I not affect the structure within the Iframe (Not to be like the second image I sent above)? Just replace the original text with what comes in the response and leave the original look.
I've already tried using functions to find the original text in the DOM to replace it with the translated text but without success. In this case, the idea was just to replace the text correspondence literally.
Thanks for your help and thanks in advance

How to connect Dear inventory APIs and Google sheet?

I would like to connect the Dear inventory system with Google Sheets for importing product lists, Sale lists data, etc by using both the Dear API and Google Sheets API.
It should be automatically updated.
Is it possible or not?
If yes, Any methods here?
Sure this is possible. With this you have a generic setup that will NOT handle nested objects or arrays. But you can create specific functions with the data you want. But for that you should hire somebody, or do the work yourself.
The setup is that you can use the generic function to get the endpoint you want to the sheet you want. dearAPI(endpoint, sheetname, dig)
Dig: This is the object that holds the array of the returned data:
The code:
//GLOBAL variables:
const accountID = 'id';
const secret = 'secret';
function onOpen(e) {
SpreadsheetApp.getUi().createMenu('DEAR')
.addItem('Update all', 'updateAll')
.addToUi();
}
function updateAll(){
dearAPI('product','Products','Products');
dearAPI('salesList','Sales', 'SalesList');
}
/**
* Updates data from specific DEAR endpoint to specific sheet.
*
* #param {string} endpoint - the endpoint the get the data from.
* #param {string} sheetname - name of the sheet to write the data to.
* #param {string} dig - the object where the array of data is returned.
* #return {void}
*/
function dearAPI(endpoint, sheetname, dig) {
const ss = SpreadsheetApp.getActiveSpreadsheet();
const sheet = ss.getSheetByName(sheetname)
const dataRows = [];
let pagenumber = 1;
do {
const url = `https://inventory.dearsystems.com/externalapi/v2/${endpoint}?page=${pagenumber}&limit=1000`;
const headers = {
"api-auth-accountid": accountID,
"api-auth-applicationkey": secret
};
const options = {
"method" : "get",
"headers" : headers
};
const response = UrlFetchApp.fetch(url, options);
const data = JSON.parse(response.getContentText())
data[dig].forEach(item => dataRows.push(item));
pagenumber++
} while (data.Total != data.Page)
const rowHeaders = Object.keys(dataRows[0]);
const rows = [rowHeaders];
for (let i = 0; i < dataRows.length; i++) {
const rowData = [];
for (let j = 0; j < rowHeaders.length; j++) {
rowData.push(dataRows[i][rowHeaders[j]]);
}
rows.push(rowData);
}
sheet.getRange(1,1,sheet.getLastRow(), sheet.getLastColumn()).clearContent();
sheet.getRange(1,1,rows.length,rows[0].length).setValues(rows);
}

How do I get specific value from JSON string in Google Script?

I'm trying to extract a specific value from this json file:
An example value I'm looking for is exDividendDate, fmt : 2020-09-24.
The code I've written to extract the value doesn't doesn't extract this or any other value and I'm not sure why. Any help would be greatly appreciated.
The error I get in the Google Apps Script is:
TypeError: Cannot read property 'earningsDate' of undefined (line 44,
file "Stock Database"
function callAPI(symbol) {
// Call the API
var url = 'https://query2.finance.yahoo.com/v10/finance/quoteSummary/'
var modules = "?modules=calendarEvents"
var response = UrlFetchApp.fetch(url + symbol + modules);
// Parse the JSON reply
var json = response.getContentText();
var data = JSON.parse(json);
console.log(data)
return JSON.parse(json)
}
function displayFinancials() {
// Load sheets
var dataSheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Results");
var modelSheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Financial Ratios Model");
// Get model input data
var company = "Apple"
var symbol = "AAPL"
// call the API
var api = callAPI(symbol);
var results = api[0];
// Output the API result
var output = [company, symbol, results.exDividendDate.fmt]
console.log(output);
dataSheet.appendRow(output)
}
When I saw the JSON data, it seems that exDividendDate is callAPI(symbol).quoteSummary.result[0].calendarEvents. So how about the following modification?
From:
var results = api[0];
To:
var results = api.quoteSummary.result[0].calendarEvents;

Convert all the sheets to pdf every 8 hour or by changing a cell value with Google Apps Script

I just started writing script and I'm trying to convert a Google spreadsheet (multiple sheets) to a PDF file by changing cell value (example A1, i try but i didn't obtain nothing) or each 8 hr. The script below works, but it only creates a PDF when i run the script. I've trying to add a project's trigger: time-based each 8 hr but i received an email: Bad value (line 4, file "Code") so in the folder there were no new pdf files.
function savePDFs( optSSId, optSheetId ) {
// If a sheet ID was provided, open that sheet, otherwise assume script is
// sheet-bound, and open the active spreadsheet.
var ss = (optSSId) ? SpreadsheetApp.openById(optSSId) :
SpreadsheetApp.getActiveSpreadsheet();
// Get folder containing spreadsheet, for later export
var parents = DriveApp.getFileById(ss.getId()).getParents();
if (parents.hasNext()) {
var folder = parents.next();
}
else {
folder = DriveApp.getRootFolder();
}
//additional parameters for exporting the sheet as a pdf
var url_ext = 'export?exportFormat=pdf&format=pdf' //export as pdf
// Print either the entire Spreadsheet or the specified sheet if
optSheetId is provided
+ (optSheetId ? ('&gid=' + sheet.getSheetId()) : ('&id=' + ss.getId()))
// following parameters are optional...
+ '&size=letter' // paper size
+ '&portrait=true' // orientation, false for landscape
+ '&fitw=true' // fit to width, false for actual size
+ '&sheetnames=false&printtitle=false&pagenumbers=false' //hide optional
headers and footers
+ '&gridlines=false' // hide gridlines
+ '&fzr=false'; // do not repeat row headers (frozen rows) on each
page
var options = {
headers: {
'Authorization': 'Bearer ' + ScriptApp.getOAuthToken()
}
}
var date = Utilities.formatDate8new Date(). "GMT+2" ""hh:mm dd/MM/yyyy")
var response = UrlFetchApp.fetch("https://docs.google.com/spreadsheets/" +
url_ext, options);
var blob = response.getBlob().setName(ss.getName() + date + '.pdf');
//from here you should be able to use and manipulate the blob to send and
email or create a file per usual.
//In this example, I save the pdf to drive
folder.createFile(blob);
}
When i tried to run the script automatically changing the cell value i wrote another script reported below (don't work):
function onEdit(e) {
var langName = 'Sheet1'
var langCell = 'A1'
var curSheet = e.range.getSheet()
if (curSheet.getName() === langName) {
if (e.range.getA1Notation() === langCell) {
savePDFs()
}
}

Google App Script - Save Spreadsheet to PDF saved on Google Drive

I'm trying to save all of the sheets on my spreadsheet to google drive as one PDF (ultimately I would like to have them email as well). I'm having trouble saving more than just one of the sheets. I've tried multiple way of doing it. The code below is the best way I've found so far. Again the problem is that it only saves the first page as a PDF, I cant figure out how to get around the delete redundant sheets. All the posts I have seen only want to save 1 page, I have over 24 pages that need to be saved as one PDF. Thanks in advance for your help!
function PDF() {
var sheetName = SpreadsheetApp.getActiveSpreadsheet();
var folderID = "*** Google Drive ID***"; // Folder id to save in a Drive folder.
var ss = SpreadsheetApp.openByUrl(
'https://docs.google.com/spreadsheets/d/***Spreadsheet ID***');
var pdfName = "MAR - " + ss.getRange("A1:A1").getValue(); //Need to set the values to another sheet
var sourceSpreadsheet = SpreadsheetApp.getActive();
var sourceSheet = sourceSpreadsheet.getSheetByName(sheetName);
var folder = DriveApp.getFolderById(folderID);
//Copy whole spreadsheet
var destSpreadsheet = SpreadsheetApp.open(DriveApp.getFileById(sourceSpreadsheet.getId()).makeCopy("tmp_convert_to_pdf", folder))
//delete redundant sheets
var sheets = destSpreadsheet.getSheets();
for (i = 0; i < sheets.length; i++) {
if (sheets[i].getSheetName() != sheetName){
destSpreadsheet.deleteSheet(sheets[i]);
}
}
var destSheet = destSpreadsheet.getSheets()[0];
//repace cell values with text (to avoid broken references)
var sourceRange = sourceSheet.getRange(1,1,sourceSheet.getMaxRows(),sourceSheet.getMaxColumns());
var sourcevalues = sourceRange.getValues();
var destRange = destSheet.getRange(1, 1, destSheet.getMaxRows(), destSheet.getMaxColumns());
destRange.setValues(sourcevalues);
//save to pdf
var theBlob = destSpreadsheet.getBlob().getAs('application/pdf').setName(pdfName);
var newFile = folder.createFile(theBlob);
//Delete the temporary sheet
DriveApp.getFileById(destSpreadsheet.getId()).setTrashed(true);
}
By using Drive API, you can convert from a spreadsheet to a PDF which has all sheets in the spreadsheet. In order to use this, so please enable Drive API on Google API Console as follows.
In the script editor, select Resources > Cloud Platform Project
At the bottom of the dialog, click the link for the Google API Console.
In the console, click into the filter box and type part of the name of the API "Drive API", then click the name once you see it.
On the next screen, click Enable API.
Close the Developers Console and return to the script editor. Click OK in the dialog.
I prepared a sample script for creating PDF file from spreadsheet. Please use this to your script.
Script :
var spreadsheetId = "#####";
var folderId = "#####";
var outputFilename = "#####";
var url = "https://www.googleapis.com/drive/v3/files/" + spreadsheetId + "/export?mimeType=application/pdf";
var options = {
method: "GET",
headers: {Authorization: "Bearer " + ScriptApp.getOAuthToken()},
muteHttpExceptions: true
};
var response = UrlFetchApp.fetch(url, options).getBlob();
DriveApp.getFolderById(folderId).createFile(response).setName(outputFilename);
About this script, although I confirmed this works fine, if it doesn't work at your environment, please tell me. And if I misunderstand your question, I'm sorry.
Added 1 :
function PDF() {
var sheetName = SpreadsheetApp.getActiveSpreadsheet();
var folderID = "*** Google Drive ID***"; // Folder id to save in a Drive folder.
var ss = SpreadsheetApp.openByUrl(
'https://docs.google.com/spreadsheets/d/***Spreadsheet ID***');
var pdfName = "MAR - " + ss.getRange("A1:A1").getValue(); //Need to set the values to another sheet
var sourceSpreadsheet = SpreadsheetApp.getActive();
var sourceSheet = sourceSpreadsheet.getSheetByName(sheetName);
var folder = DriveApp.getFolderById(folderID);
//Copy whole spreadsheet
var destSpreadsheet = SpreadsheetApp.open(DriveApp.getFileById(sourceSpreadsheet.getId()).makeCopy("tmp_convert_to_pdf", folder))
//delete redundant sheets
var sheets = destSpreadsheet.getSheets();
for (i = 0; i < sheets.length; i++) {
if (sheets[i].getSheetName() != sheetName){
destSpreadsheet.deleteSheet(sheets[i]);
}
}
var destSheet = destSpreadsheet.getSheets()[0];
//repace cell values with text (to avoid broken references)
var sourceRange = sourceSheet.getRange(1,1,sourceSheet.getMaxRows(),sourceSheet.getMaxColumns());
var sourcevalues = sourceRange.getValues();
var destRange = destSheet.getRange(1, 1, destSheet.getMaxRows(), destSheet.getMaxColumns());
destRange.setValues(sourcevalues);
//save to pdf
// var theBlob = destSpreadsheet.getBlob().getAs('application/pdf').setName(pdfName);
// var newFile = folder.createFile(theBlob);
// A sample script was added here.
var url = "https://www.googleapis.com/drive/v3/files/" + destSpreadsheet.getId() + "/export?mimeType=application/pdf";
var options = {
method: "GET",
headers: {Authorization: "Bearer " + ScriptApp.getOAuthToken()},
muteHttpExceptions: true
};
var response = UrlFetchApp.fetch(url, options).getBlob();
DriveApp.getFolderById(folderID).createFile(response).setName(pdfName);
//Delete the temporary sheet
DriveApp.getFileById(destSpreadsheet.getId()).setTrashed(true);
}
Added 2 :
function PDF() {
var folderID = "*** Google Drive ID***"; // Folder id to save in a Drive folder.
var ss = SpreadsheetApp.openByUrl(
'https://docs.google.com/spreadsheets/d/***Spreadsheet ID***');
var pdfName = "MAR - " + ss.getRange("A1:A1").getValue(); //Need to set the values to another sheet
var sourceSpreadsheet = SpreadsheetApp.getActive();
var folder = DriveApp.getFolderById(folderID);
// A sample script was added here.
var url = "https://www.googleapis.com/drive/v3/files/" + sourceSpreadsheet.getId() + "/export?mimeType=application/pdf";
var options = {
method: "GET",
headers: {Authorization: "Bearer " + ScriptApp.getOAuthToken()},
muteHttpExceptions: true
};
var response = UrlFetchApp.fetch(url, options).getBlob();
DriveApp.getFolderById(folderID).createFile(response).setName(pdfName);
}
Here is the function that creates PDF file from google sheet and function that moves new created file into the folder that id you give to the parameter of the function.
function downloadPDF(fileId, folderId) {
var file = Drive.Files.get(fileId);
var url = file.exportLinks[MimeType.PDF];
var options = {
headers: {
Authorization:"Bearer " + ScriptApp.getOAuthToken()
},
muteHttpExceptions : true
}
var response = UrlFetchApp.fetch(url, options);
var status = response.getResponseCode();
var result = response.getContentText();
if (status != 200) {
// Get additional error message info, depending on format
if (result.toUpperCase().indexOf("<HTML") !== -1) {
var message = strip_tags(result);
}
else if (result.indexOf('errors') != -1) {
message = JSON.parse(result).error.message;
}
throw new Error('Error (' + status + ") " + message );
}
var doc = response.getBlob();
var newFileid = DriveApp.createFile(doc).setName(file.title + '.pdf').getId();
let id = moveFileTo(newFileid, folderId);
return id;
}
function moveFileTo(sourceId, folderId){
let file = DriveApp.getFileById(sourceId)
let blob = file.getBlob();
let id = DriveApp.getFolderById(folderId).createFile(blob).getId();
file.setTrashed(true)
return id;
}